Diffstat (limited to 'test/CodeGen')
185 files changed, 19443 insertions, 14257 deletions
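Most of the churn below is mechanical: CHECK lines are rewritten for new attribute sets, for the new !DIGlobalVariableExpression debug-info shape, and into shorter CHECK-LABEL patterns. As a reading aid, here is a minimal sketch of the FileCheck capture-and-reuse idiom these tests lean on; the function name and RUN line are illustrative, not taken from the diff:

// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
void f(void) {}
// [[ATTRS:#[0-9]+]] captures the attribute-group id at the definition...
// CHECK: define void @f() [[ATTRS:#[0-9]+]]
// ...and [[ATTRS]] reuses the captured id, so the test keeps passing
// even when attribute-group numbering shifts between compiler versions.
// CHECK: attributes [[ATTRS]] = { {{.*}}nounwind{{.*}} }

CHECK-NOT asserts that a pattern does not occur between the surrounding matches, and CHECK-SAME continues matching on the same line as the previous match; both appear throughout the diffs below.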
diff --git a/test/CodeGen/2008-04-08-NoExceptions.c b/test/CodeGen/2008-04-08-NoExceptions.c index 1213492d1db0..6528c35b5485 100644 --- a/test/CodeGen/2008-04-08-NoExceptions.c +++ b/test/CodeGen/2008-04-08-NoExceptions.c @@ -9,4 +9,4 @@ void g(void) { // CHECK-NOT: declare void @f() [[NUW]] -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/2009-10-20-GlobalDebug.c b/test/CodeGen/2009-10-20-GlobalDebug.c index 080f02ea7ed4..0d7c759f905e 100644 --- a/test/CodeGen/2009-10-20-GlobalDebug.c +++ b/test/CodeGen/2009-10-20-GlobalDebug.c @@ -1,16 +1,20 @@ // REQUIRES: x86-registered-target // RUN: %clang -target i386-apple-darwin10 -flto -S -g %s -o - | FileCheck %s + +// CHECK: @main.localstatic = internal global i32 0, align 4, !dbg [[L:![0-9]+]] +// CHECK: @global = common global i32 0, align 4, !dbg [[G:![0-9]+]] + int global; int main() { static int localstatic; return 0; } -// CHECK: !DIGlobalVariable(name: "localstatic" -// CHECK-NOT: linkageName: -// CHECK-SAME: line: 5, -// CHECK-SAME: variable: i32* @main.localstatic -// CHECK: !DIGlobalVariable(name: "global" -// CHECK-NOT: linkageName: -// CHECK-SAME: line: 3, -// CHECK-SAME: variable: i32* @global +// CHECK: [[L]] = !DIGlobalVariableExpression(var: [[LV:.*]]) +// CHECK: [[LV]] = distinct !DIGlobalVariable(name: "localstatic" +// CHECK-NOT: linkageName: +// CHECK-SAME: line: 9, +// CHECK: [[G]] = !DIGlobalVariableExpression(var: [[GV:.*]]) +// CHECK: [[GV]] = distinct !DIGlobalVariable(name: "global" +// CHECK-NOT: linkageName: +// CHECK-SAME: line: 7, diff --git a/test/CodeGen/2010-08-10-DbgConstant.c b/test/CodeGen/2010-08-10-DbgConstant.c index cbc1841cf187..68947edd526a 100644 --- a/test/CodeGen/2010-08-10-DbgConstant.c +++ b/test/CodeGen/2010-08-10-DbgConstant.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s -// CHECK: !DIGlobalVariable( +// CHECK: !DIGlobalVariableExpression(var: [[VAR:.*]], expr: [[EXPR:![0-9]+]]) +// CHECK: [[EXPR]] = !DIExpression(DW_OP_constu, 201, DW_OP_stack_value) static const unsigned int ro = 201; void bar(int); diff --git a/test/CodeGen/3dnow-builtins.c b/test/CodeGen/3dnow-builtins.c index fdee2a7619a0..26e8700b9a2c 100644 --- a/test/CodeGen/3dnow-builtins.c +++ b/test/CodeGen/3dnow-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK -// RUN: %clang_cc1 %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/64bit-swiftcall.c b/test/CodeGen/64bit-swiftcall.c new file mode 100644 index 000000000000..c1f098172371 --- /dev/null +++ b/test/CodeGen/64bit-swiftcall.c @@ -0,0 +1,1007 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s + +// REQUIRES: aarch64-registered-target,x86-registered-target + +#define SWIFTCALL __attribute__((swiftcall)) +#define OUT __attribute__((swift_indirect_result)) +#define ERROR __attribute__((swift_error_result)) +#define CONTEXT __attribute__((swift_context)) + +// CHECK: [[STRUCT2_RESULT:@.*]] = private {{.*}} constant [[STRUCT2_TYPE:%.*]] { i32 0, i8 0, i8 undef, i8 0, float 0.000000e+00, float 0.000000e+00 } + +/*****************************************************************************/ +/****************************** PARAMETER ABIS *******************************/ +/*****************************************************************************/ + +SWIFTCALL void indirect_result_1(OUT int *arg0, OUT float *arg1) {} +// CHECK-LABEL: define {{.*}} void @indirect_result_1(i32* noalias sret align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +// TODO: maybe this shouldn't suppress sret. +SWIFTCALL int indirect_result_2(OUT int *arg0, OUT float *arg1) { __builtin_unreachable(); } +// CHECK-LABEL: define {{.*}} i32 @indirect_result_2(i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +typedef struct { char array[1024]; } struct_reallybig; +SWIFTCALL struct_reallybig indirect_result_3(OUT int *arg0, OUT float *arg1) { __builtin_unreachable(); } +// CHECK-LABEL: define {{.*}} void @indirect_result_3({{.*}}* noalias sret {{.*}}, i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +SWIFTCALL void context_1(CONTEXT void *self) {} +// CHECK-LABEL: define {{.*}} void @context_1(i8* swiftself + +SWIFTCALL void context_2(void *arg0, CONTEXT void *self) {} +// CHECK-LABEL: define {{.*}} void @context_2(i8*{{.*}}, i8* swiftself + +SWIFTCALL void context_error_1(CONTEXT int *self, ERROR float **error) {} +// CHECK-LABEL: define {{.*}} void @context_error_1(i32* swiftself{{.*}}, float** swifterror) +// CHECK: [[TEMP:%.*]] = alloca float*, align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[ERRORARG:%.*]], align 8 +// CHECK: store float* [[T0]], float** [[TEMP]], align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[TEMP]], align 8 +// CHECK: store float* [[T0]], float** [[ERRORARG]], align 8 +void test_context_error_1() { + int x; + float *error; + context_error_1(&x, &error); +} +// CHECK-LABEL: define void @test_context_error_1() +// CHECK: [[X:%.*]] = alloca i32, align 4 +// CHECK: [[ERROR:%.*]] = alloca float*, align 8 +// CHECK: [[TEMP:%.*]] = alloca swifterror float*, align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[ERROR]], align 8 +// CHECK: store float* [[T0]], float** [[TEMP]], align 8 +// CHECK: call [[SWIFTCC:swiftcc]] void @context_error_1(i32* swiftself [[X]], float** swifterror [[TEMP]]) +// CHECK: [[T0:%.*]] = load float*, float** [[TEMP]], align 8 +// CHECK: store float* [[T0]], float** [[ERROR]], align 8 + +SWIFTCALL void context_error_2(short s, CONTEXT int *self, ERROR float **error) {} +// CHECK-LABEL: define {{.*}} void @context_error_2(i16{{.*}}, i32* swiftself{{.*}}, float** swifterror) + 
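The context_error_1/test_context_error_1 pair above is the heart of the swifterror checks: at the C level the ERROR parameter is an ordinary pointer-to-pointer out-parameter, and the swifterror register convention is purely an ABI detail that the CHECK lines pin down. A minimal sketch of a caller, mirroring the test above (call_with_error is hypothetical, not part of the test file):

SWIFTCALL void context_error_1(CONTEXT int *self, ERROR float **error);

void call_with_error(void) {
  int ctx;
  float *err = 0;               // callee may write back through the ERROR parameter
  context_error_1(&ctx, &err);  // lowered so &err travels via the swifterror convention
  if (err) { /* an error value was written back */ }
}

Codegen materializes the alloca-plus-copy sequence checked above so that the caller's ordinary stack slot for err and the swifterror value stay in sync across the call.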
+/*****************************************************************************/ +/********************************** LOWERING *********************************/ +/*****************************************************************************/ + +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float8 __attribute__((ext_vector_type(8))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double4 __attribute__((ext_vector_type(4))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int int5 __attribute__((ext_vector_type(5))); +typedef int int8 __attribute__((ext_vector_type(8))); +typedef char char16 __attribute__((ext_vector_type(16))); +typedef short short8 __attribute__((ext_vector_type(8))); +typedef long long long2 __attribute__((ext_vector_type(2))); + +#define TEST(TYPE) \ + SWIFTCALL TYPE return_##TYPE(void) { \ + TYPE result = {}; \ + return result; \ + } \ + SWIFTCALL void take_##TYPE(TYPE v) { \ + } \ + void test_##TYPE() { \ + take_##TYPE(return_##TYPE()); \ + } + +/*****************************************************************************/ +/*********************************** STRUCTS *********************************/ +/*****************************************************************************/ + +typedef struct { +} struct_empty; +TEST(struct_empty); +// CHECK-LABEL: define {{.*}} @return_struct_empty() +// CHECK: ret void +// CHECK-LABEL: define {{.*}} @take_struct_empty() +// CHECK: ret void + +typedef struct { + int x; + char c0; + char c1; + float f0; + float f1; +} struct_1; +TEST(struct_1); +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_1() {{.*}}{ +// CHECK: [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[VAR:%.*]] = alloca [[STRUCT1]], align 4 +// CHECK: call void @llvm.memset +// CHECK: call void @llvm.memcpy +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = load i64, i64* [[GEP1]], align 4 +// CHECK: [[R0:%.*]] = insertvalue { i64, i64 } undef, i64 [[T0]], 0 +// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1 +// CHECK: ret { i64, i64 } [[R1]] +// CHECK: } +// CHECK-LABEL: define swiftcc void @take_struct_1(i64, i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[V]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: store i64 %1, i64* [[GEP1]], align 4 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_struct_1() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[RET:%.*]] = call swiftcc { i64, i64 } @return_struct_1() +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[AGG]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[E0:%.*]] = extractvalue { i64, i64 } [[RET]], 0 +// CHECK: store i64 [[E0]], i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* 
[[CAST]], i32 0, i32 1 +// CHECK: [[E1:%.*]] = extractvalue { i64, i64 } [[RET]], 1 +// CHECK: store i64 [[E1]], i64* [[GEP1]], align 4 +// CHECK: [[CAST2:%.*]] = bitcast [[STRUCT1]]* [[AGG]] to { i64, i64 }* +// CHECK: [[GEP2:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST2]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[GEP2]], align 4 +// CHECK: [[GEP3:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST2]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[GEP3]], align 4 +// CHECK: call swiftcc void @take_struct_1(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +typedef struct { + int x; + char c0; + __attribute__((aligned(2))) char c1; + float f0; + float f1; +} struct_2; +TEST(struct_2); +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_2() {{.*}}{ +// CHECK: [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]] +// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]] +// CHECK: [[CASTRET:%.*]] = bitcast {{.*}} [[RET]] +// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]] +// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]] +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = load i64, i64* [[GEP1]], align 4 +// CHECK: [[R0:%.*]] = insertvalue { i64, i64 } undef, i64 [[T0]], 0 +// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1 +// CHECK: ret { i64, i64 } [[R1]] +// CHECK: } +// CHECK-LABEL: define swiftcc void @take_struct_2(i64, i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: store i64 %1, i64* [[GEP1]], align 4 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_struct_2() {{.*}} { +// CHECK: [[TMP:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_struct_2() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[STRUCT2_TYPE]]* [[TMP]] to { i64, i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds {{.*}} [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T0]], i64* [[GEP]], align 4 +// CHECK: [[GEP:%.*]] = getelementptr inbounds {{.*}} [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T0:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T0]], i64* [[GEP]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[TMP]] to { i64, i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 4 +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[R1:%.*]] = load i64, i64* [[GEP]], align 4 +// CHECK: call swiftcc void @take_struct_2(i64 [[R0]], i64 [[R1]]) +// CHECK: ret void +// CHECK: } + +// There's no way to put a field randomly 
in the middle of an otherwise +// empty storage unit in C, so that case has to be tested in C++, which +// can use empty structs to introduce arbitrary padding. (In C, they end up +// with size 0 and so don't affect layout.) + +// Misaligned data rule. +typedef struct { + char c0; + __attribute__((packed)) float f; +} struct_misaligned_1; +TEST(struct_misaligned_1) +// CHECK-LABEL: define swiftcc i64 @return_struct_misaligned_1() +// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8* +// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5 +// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8* +// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1 +// CHECK: ret i64 [[R0]] +// CHECK:} +// CHECK-LABEL: define swiftcc void @take_struct_misaligned_1(i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP]], align 1 +// CHECK: ret void +// CHECK: } +// CHECK: define void @test_struct_misaligned_1() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_struct_misaligned_1() +// CHECK: [[T0:%.*]] = bitcast [[STRUCT]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: store i64 [[CALL]], i64* [[T1]], align 1 +// CHECK: [[T0:%.*]] = bitcast [[STRUCT]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: [[P:%.*]] = load i64, i64* [[T1]], align 1 +// CHECK: call swiftcc void @take_struct_misaligned_1(i64 [[P]]) +// CHECK: ret void +// CHECK: } + +// Too many scalars. +typedef struct { + long long x[5]; +} struct_big_1; +TEST(struct_big_1) + +// CHECK-LABEL: define {{.*}} void @return_struct_big_1({{.*}} noalias sret + +// Should not be byval. 
+// CHECK-LABEL: define {{.*}} void @take_struct_big_1({{.*}}*{{( %.*)?}}) + +/*****************************************************************************/ +/********************************* TYPE MERGING ******************************/ +/*****************************************************************************/ + +typedef union { + float f; + double d; +} union_het_fp; +TEST(union_het_fp) +// CHECK-LABEL: define swiftcc i64 @return_union_het_fp() +// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]] +// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8* +// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]] +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8 +// CHECK: ret i64 [[R0]] +// CHECK-LABEL: define swiftcc void @take_union_het_fp(i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[V]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP]], align 8 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_union_het_fp() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_union_het_fp() +// CHECK: [[T0:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: store i64 [[CALL]], i64* [[T1]], align 8 +// CHECK: [[T0:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T1]], align 8 +// CHECK: call swiftcc void @take_union_het_fp(i64 [[V0]]) +// CHECK: ret void +// CHECK: } + + +typedef union { + float f1; + float f2; +} union_hom_fp; +TEST(union_hom_fp) +// CHECK-LABEL: define void @test_union_hom_fp() +// CHECK: [[TMP:%.*]] = alloca [[REC:%.*]], align 4 +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] float @return_union_hom_fp() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ float }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store float [[CALL]], float* [[T0]], align 4 +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load float, float* [[T0]], align 4 +// CHECK: call [[SWIFTCC]] void @take_union_hom_fp(float [[FIRST]]) +// CHECK: ret void + +typedef union { + float f1; + float4 fv2; +} union_hom_fp_partial; +TEST(union_hom_fp_partial) +// CHECK: define void @test_union_hom_fp_partial() +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_hom_fp_partial() +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T1]], i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { 
i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T1]], i64* [[T0]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[T0]], align 8 +// CHECK: call swiftcc void @take_union_hom_fp_partial(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +typedef union { + struct { int x, y; } f1; + float4 fv2; +} union_het_fpv_partial; +TEST(union_het_fpv_partial) +// CHECK-LABEL: define void @test_union_het_fpv_partial() +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_het_fpv_partial() +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T1]], i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T1]], i64* [[T0]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[T0]], align 8 +// CHECK: call swiftcc void @take_union_het_fpv_partial(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +/*****************************************************************************/ +/****************************** VECTOR LEGALIZATION **************************/ +/*****************************************************************************/ + +TEST(int4) +// CHECK-LABEL: define {{.*}} <4 x i32> @return_int4() +// CHECK-LABEL: define {{.*}} @take_int4(<4 x i32> + +TEST(int8) +// CHECK-LABEL: define {{.*}} @return_int8() +// CHECK: [[RET:%.*]] = alloca [[REC:<8 x i32>]], align 16 +// CHECK: [[VAR:%.*]] = alloca [[REC]], align +// CHECK: store +// CHECK: load +// CHECK: store +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, <4 x i32> }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, <4 x i32> }]] undef, <4 x i32> [[FIRST]], 0 +// CHECK: [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], <4 x i32> [[SECOND]], 1 +// CHECK: ret [[UAGG]] [[T1]] +// CHECK-LABEL: define {{.*}} @take_int8(<4 x i32>, <4 x i32>) +// CHECK: [[V:%.*]] = alloca [[REC]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store <4 x i32> %0, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], 
i32 0, i32 1 +// CHECK: store <4 x i32> %1, <4 x i32>* [[T0]], align +// CHECK: ret void +// CHECK-LABEL: define void @test_int8() +// CHECK: [[TMP1:%.*]] = alloca [[REC]], align +// CHECK: [[TMP2:%.*]] = alloca [[REC]], align +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int8() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align +// CHECK: store [[REC]] [[V]], [[REC]]* [[TMP2]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: call [[SWIFTCC]] void @take_int8(<4 x i32> [[FIRST]], <4 x i32> [[SECOND]]) +// CHECK: ret void + +TEST(int5) +// CHECK-LABEL: define {{.*}} @return_int5() +// CHECK: [[RET:%.*]] = alloca [[REC:<5 x i32>]], align 16 +// CHECK: [[VAR:%.*]] = alloca [[REC]], align +// CHECK: store +// CHECK: load +// CHECK: store +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, i32 }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align +// CHECK: [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, i32 }]] undef, <4 x i32> [[FIRST]], 0 +// CHECK: [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1 +// CHECK: ret [[UAGG]] [[T1]] +// CHECK-LABEL: define {{.*}} @take_int5(<4 x i32>, i32) +// CHECK: [[V:%.*]] = alloca [[REC]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store <4 x i32> %0, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: store i32 %1, i32* [[T0]], align +// CHECK: ret void +// CHECK-LABEL: define void @test_int5() +// CHECK: [[TMP1:%.*]] = alloca [[REC]], align +// CHECK: [[TMP2:%.*]] = alloca [[REC]], align +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int5() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1 +// CHECK: store i32 [[T1]], i32* [[T0]], align +// CHECK: [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align +// CHECK: store [[REC]] [[V]], [[REC]]* [[TMP2]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]* +// 
CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align +// CHECK: call [[SWIFTCC]] void @take_int5(<4 x i32> [[FIRST]], i32 [[SECOND]]) +// CHECK: ret void + +typedef struct { + int x; + int3 v __attribute__((packed)); +} misaligned_int3; +TEST(misaligned_int3) +// CHECK-LABEL: define swiftcc void @take_misaligned_int3(i64, i64) + +typedef struct { + float f0; +} struct_f1; +TEST(struct_f1) +// CHECK-LABEL: define swiftcc float @return_struct_f1() +// CHECK-LABEL: define swiftcc void @take_struct_f1(float) + +typedef struct { + float f0; + float f1; +} struct_f2; +TEST(struct_f2) +// CHECK-LABEL: define swiftcc i64 @return_struct_f2() +// CHECK-LABEL: define swiftcc void @take_struct_f2(i64) + +typedef struct { + float f0; + float f1; + float f2; +} struct_f3; +TEST(struct_f3) +// CHECK-LABEL: define swiftcc { i64, float } @return_struct_f3() +// CHECK-LABEL: define swiftcc void @take_struct_f3(i64, float) + +typedef struct { + float f0; + float f1; + float f2; + float f3; +} struct_f4; +TEST(struct_f4) +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_f4() +// CHECK-LABEL: define swiftcc void @take_struct_f4(i64, i64) + + +typedef struct { + double d0; +} struct_d1; +TEST(struct_d1) +// CHECK-LABEL: define swiftcc double @return_struct_d1() +// CHECK-LABEL: define swiftcc void @take_struct_d1(double) + +typedef struct { + double d0; + double d1; +} struct_d2; +TEST(struct_d2) + +// CHECK-LABEL: define swiftcc { double, double } @return_struct_d2() +// CHECK-LABEL: define swiftcc void @take_struct_d2(double, double) +typedef struct { + double d0; + double d1; + double d2; +} struct_d3; +TEST(struct_d3) +// CHECK-LABEL: define swiftcc { double, double, double } @return_struct_d3() +// CHECK-LABEL: define swiftcc void @take_struct_d3(double, double, double) + +typedef struct { + double d0; + double d1; + double d2; + double d3; +} struct_d4; +TEST(struct_d4) +// CHECK-LABEL: define swiftcc { double, double, double, double } @return_struct_d4() +// CHECK-LABEL: define swiftcc void @take_struct_d4(double, double, double, double) + +typedef struct { + double d0; + double d1; + double d2; + double d3; + double d4; +} struct_d5; +TEST(struct_d5) +// CHECK: define swiftcc void @return_struct_d5([[STRUCT5:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_d5([[STRUCT5]] + +typedef struct { + char c0; +} struct_c1; +TEST(struct_c1) +// CHECK-LABEL: define swiftcc i8 @return_struct_c1() +// CHECK-LABEL: define swiftcc void @take_struct_c1(i8) + +typedef struct { + char c0; + char c1; +} struct_c2; +TEST(struct_c2) +// CHECK-LABEL: define swiftcc i16 @return_struct_c2() +// CHECK-LABEL: define swiftcc void @take_struct_c2(i16) +// + +typedef struct { + char c0; + char c1; + char c2; +} struct_c3; +TEST(struct_c3) +// CHECK-LABEL: define swiftcc i32 @return_struct_c3() +// CHECK-LABEL: define swiftcc void @take_struct_c3(i32) + +typedef struct { + char c0; + char c1; + char c2; + char c3; +} struct_c4; +TEST(struct_c4) +// CHECK-LABEL: define swiftcc i32 @return_struct_c4() +// CHECK-LABEL: define swiftcc void @take_struct_c4(i32) + +typedef struct { + char c0; + char c1; + char c2; + char c3; + char c4; +} struct_c5; +TEST(struct_c5) +// CHECK-LABEL: define swiftcc i64 @return_struct_c5() +// CHECK-LABEL: define swiftcc void 
@take_struct_c5(i64) +// +typedef struct { + char c0; + char c1; + char c2; + char c3; + char c4; + char c5; + char c6; + char c7; + char c8; +} struct_c9; +TEST(struct_c9) +// CHECK-LABEL: define swiftcc { i64, i8 } @return_struct_c9() +// CHECK-LABEL: define swiftcc void @take_struct_c9(i64, i8) + +typedef struct { + short s0; +} struct_s1; +TEST(struct_s1) +// CHECK-LABEL: define swiftcc i16 @return_struct_s1() +// CHECK-LABEL: define swiftcc void @take_struct_s1(i16) + +typedef struct { + short s0; + short s1; +} struct_s2; +TEST(struct_s2) +// CHECK-LABEL: define swiftcc i32 @return_struct_s2() +// CHECK-LABEL: define swiftcc void @take_struct_s2(i32) +// + +typedef struct { + short s0; + short s1; + short s2; +} struct_s3; +TEST(struct_s3) +// CHECK-LABEL: define swiftcc i64 @return_struct_s3() +// CHECK-LABEL: define swiftcc void @take_struct_s3(i64) + +typedef struct { + short s0; + short s1; + short s2; + short s3; +} struct_s4; +TEST(struct_s4) +// CHECK-LABEL: define swiftcc i64 @return_struct_s4() +// CHECK-LABEL: define swiftcc void @take_struct_s4(i64) + +typedef struct { + short s0; + short s1; + short s2; + short s3; + short s4; +} struct_s5; +TEST(struct_s5) +// CHECK-LABEL: define swiftcc { i64, i16 } @return_struct_s5() +// CHECK-LABEL: define swiftcc void @take_struct_s5(i64, i16) + + +typedef struct { + int i0; +} struct_i1; +TEST(struct_i1) +// CHECK-LABEL: define swiftcc i32 @return_struct_i1() +// CHECK-LABEL: define swiftcc void @take_struct_i1(i32) + +typedef struct { + int i0; + int i1; +} struct_i2; +TEST(struct_i2) +// CHECK-LABEL: define swiftcc i64 @return_struct_i2() +// CHECK-LABEL: define swiftcc void @take_struct_i2(i64) + +typedef struct { + int i0; + int i1; + int i2; +} struct_i3; +TEST(struct_i3) +// CHECK-LABEL: define swiftcc { i64, i32 } @return_struct_i3() +// CHECK-LABEL: define swiftcc void @take_struct_i3(i64, i32) + +typedef struct { + int i0; + int i1; + int i2; + int i3; +} struct_i4; +TEST(struct_i4) +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_i4() +// CHECK-LABEL: define swiftcc void @take_struct_i4(i64, i64) + +typedef struct { + long long l0; +} struct_l1; +TEST(struct_l1) +// CHECK-LABEL: define swiftcc i64 @return_struct_l1() +// CHECK-LABEL: define swiftcc void @take_struct_l1(i64) + +typedef struct { + long long l0; + long long l1; +} struct_l2; +TEST(struct_l2) +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_l2() +// CHECK-LABEL: define swiftcc void @take_struct_l2(i64, i64) + +typedef struct { + long long l0; + long long l1; + long long l2; +} struct_l3; +TEST(struct_l3) +// CHECK-LABEL: define swiftcc { i64, i64, i64 } @return_struct_l3() +// CHECK-LABEL: define swiftcc void @take_struct_l3(i64, i64, i64) + +typedef struct { + long long l0; + long long l1; + long long l2; + long long l3; +} struct_l4; +TEST(struct_l4) +// CHECK-LABEL: define swiftcc { i64, i64, i64, i64 } @return_struct_l4() +// CHECK-LABEL: define swiftcc void @take_struct_l4(i64, i64, i64, i64) + +typedef struct { + long long l0; + long long l1; + long long l2; + long long l3; + long long l4; +} struct_l5; +TEST(struct_l5) +// CHECK: define swiftcc void @return_struct_l5([[STRUCT5:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_l5([[STRUCT5]]* + +typedef struct { + char16 c0; +} struct_vc1; +TEST(struct_vc1) +// CHECK-LABEL: define swiftcc <16 x i8> @return_struct_vc1() +// CHECK-LABEL: define swiftcc void @take_struct_vc1(<16 x i8>) + +typedef struct { + char16 c0; + char16 c1; +} struct_vc2; +TEST(struct_vc2) +// 
CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8> } @return_struct_vc2() +// CHECK-LABEL: define swiftcc void @take_struct_vc2(<16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; +} struct_vc3; +TEST(struct_vc3) +// CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8>, <16 x i8> } @return_struct_vc3() +// CHECK-LABEL: define swiftcc void @take_struct_vc3(<16 x i8>, <16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; + char16 c3; +} struct_vc4; +TEST(struct_vc4) +// CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @return_struct_vc4() +// CHECK-LABEL: define swiftcc void @take_struct_vc4(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; + char16 c3; + char16 c4; +} struct_vc5; +TEST(struct_vc5) +// CHECK: define swiftcc void @return_struct_vc5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vc5([[STRUCT]] + +typedef struct { + short8 c0; +} struct_vs1; +TEST(struct_vs1) +// CHECK-LABEL: define swiftcc <8 x i16> @return_struct_vs1() +// CHECK-LABEL: define swiftcc void @take_struct_vs1(<8 x i16>) + +typedef struct { + short8 c0; + short8 c1; +} struct_vs2; +TEST(struct_vs2) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16> } @return_struct_vs2() +// CHECK-LABEL: define swiftcc void @take_struct_vs2(<8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; +} struct_vs3; +TEST(struct_vs3) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16>, <8 x i16> } @return_struct_vs3() +// CHECK-LABEL: define swiftcc void @take_struct_vs3(<8 x i16>, <8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; + short8 c3; +} struct_vs4; +TEST(struct_vs4) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @return_struct_vs4() +// CHECK-LABEL: define swiftcc void @take_struct_vs4(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; + short8 c3; + short8 c4; +} struct_vs5; +TEST(struct_vs5) +// CHECK: define swiftcc void @return_struct_vs5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vs5([[STRUCT]] + +typedef struct { + int4 c0; +} struct_vi1; +TEST(struct_vi1) +// CHECK-LABEL: define swiftcc <4 x i32> @return_struct_vi1() +// CHECK-LABEL: define swiftcc void @take_struct_vi1(<4 x i32>) + +typedef struct { + int4 c0; + int4 c1; +} struct_vi2; +TEST(struct_vi2) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32> } @return_struct_vi2() +// CHECK-LABEL: define swiftcc void @take_struct_vi2(<4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; +} struct_vi3; +TEST(struct_vi3) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32>, <4 x i32> } @return_struct_vi3() +// CHECK-LABEL: define swiftcc void @take_struct_vi3(<4 x i32>, <4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; + int4 c3; +} struct_vi4; +TEST(struct_vi4) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @return_struct_vi4() +// CHECK-LABEL: define swiftcc void @take_struct_vi4(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; + int4 c3; + int4 c4; +} struct_vi5; +TEST(struct_vi5) +// CHECK: define swiftcc void @return_struct_vi5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vi5([[STRUCT]] + +typedef struct { + long2 c0; +} struct_vl1; +TEST(struct_vl1) +// CHECK-LABEL: define swiftcc <2 
x i64> @return_struct_vl1() +// CHECK-LABEL: define swiftcc void @take_struct_vl1(<2 x i64>) + +typedef struct { + long2 c0; + long2 c1; + long2 c2; + long2 c3; +} struct_vl4; +TEST(struct_vl4) +// CHECK-LABEL: define swiftcc { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @return_struct_vl4() +// CHECK-LABEL: define swiftcc void @take_struct_vl4(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>) + +typedef struct { + long2 c0; + long2 c1; + long2 c2; + long2 c3; + long2 c4; +} struct_vl5; +TEST(struct_vl5) +// CHECK: define swiftcc void @return_struct_vl5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vl5([[STRUCT]] + +typedef struct { + double2 c0; +} struct_vd1; +TEST(struct_vd1) +// CHECK-LABEL: define swiftcc <2 x double> @return_struct_vd1() +// CHECK-LABEL: define swiftcc void @take_struct_vd1(<2 x double>) + +typedef struct { + double2 c0; + double2 c1; + double2 c2; + double2 c3; +} struct_vd4; +TEST(struct_vd4) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @return_struct_vd4() +// CHECK-LABEL: define swiftcc void @take_struct_vd4(<2 x double>, <2 x double>, <2 x double>, <2 x double>) + +typedef struct { + double2 c0; + double2 c1; + double2 c2; + double2 c3; + double2 c4; +} struct_vd5; +TEST(struct_vd5) +// CHECK: define swiftcc void @return_struct_vd5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vd5([[STRUCT]] + +typedef struct { + double4 c0; +} struct_vd41; +TEST(struct_vd41) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double> } @return_struct_vd41() +// CHECK-LABEL: define swiftcc void @take_struct_vd41(<2 x double>, <2 x double>) + +typedef struct { + double4 c0; + double4 c1; +} struct_vd42; +TEST(struct_vd42) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @return_struct_vd42() +// CHECK-LABEL: define swiftcc void @take_struct_vd42(<2 x double>, <2 x double>, <2 x double>, <2 x double>) + +typedef struct { + double4 c0; + double4 c1; + double4 c2; +} struct_vd43; +TEST(struct_vd43) +// CHECK: define swiftcc void @return_struct_vd43([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vd43([[STRUCT]] + +typedef struct { + float4 c0; +} struct_vf1; +TEST(struct_vf1) +// CHECK-LABEL: define swiftcc <4 x float> @return_struct_vf1() +// CHECK-LABEL: define swiftcc void @take_struct_vf1(<4 x float>) + +typedef struct { + float4 c0; + float4 c1; +} struct_vf2; +TEST(struct_vf2) +// CHECK-LABEL: define swiftcc { <4 x float>, <4 x float> } @return_struct_vf2() +// CHECK-LABEL: define swiftcc void @take_struct_vf2(<4 x float>, <4 x float>) + +typedef struct { + float4 c0; + float4 c1; + float4 c2; + float4 c3; +} struct_vf4; +TEST(struct_vf4) +// CHECK-LABEL: define swiftcc { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @return_struct_vf4() +// CHECK-LABEL: define swiftcc void @take_struct_vf4(<4 x float>, <4 x float>, <4 x float>, <4 x float>) + +typedef struct { + float4 c0; + float4 c1; + float4 c2; + float4 c3; + float4 c4; +} struct_vf5; +TEST(struct_vf5) +// CHECK: define swiftcc void @return_struct_vf5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vf5([[STRUCT]] + +typedef struct { + float8 c0; +} struct_vf81; +TEST(struct_vf81) +// CHECK-LABEL: define swiftcc { <4 x float>, <4 x float> } @return_struct_vf81() +// CHECK-LABEL: define swiftcc void @take_struct_vf81(<4 x float>, <4 x float>) + +// Don't crash. 
+typedef union { +int4 v[2]; +struct { + int LSW; + int d7; + int d6; + int d5; + int d4; + int d3; + int d2; + int MSW; +} s; +} union_het_vecint; +TEST(union_het_vecint) +// CHECK: define swiftcc void @return_union_het_vecint([[UNION:%.*]]* noalias sret +// CHECK: define swiftcc void @take_union_het_vecint([[UNION]]* diff --git a/test/CodeGen/CFStrings.c b/test/CodeGen/CFStrings.c index 17de39c6aed8..4edb5ff26e60 100644 --- a/test/CodeGen/CFStrings.c +++ b/test/CodeGen/CFStrings.c @@ -13,6 +13,10 @@ // RUN: %clang_cc1 -triple i386-apple-macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32 // RUN: %clang_cc1 -triple x86_64-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO64 +// RUN: %clang_cc1 -triple thumbv7-windows -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-COFF +// RUN: %clang_cc1 -triple thumbv7-elf -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-ELF +// RUN: %clang_cc1 -triple thumbv7-macho -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-MACHO + typedef struct __CFString *CFStringRef; const CFStringRef one = (CFStringRef)__builtin___CFStringMakeConstantString("one"); const CFStringRef two = (CFStringRef)__builtin___CFStringMakeConstantString("\xef\xbf\xbd\x74\xef\xbf\xbd\x77\xef\xbf\xbd\x6f"); @@ -21,21 +25,21 @@ const CFStringRef two = (CFStringRef)__builtin___CFStringMakeConstantString("\xe // CHECK-ELF: @.str = private unnamed_addr constant [4 x i8] c"one\00", align 1 // CHECK-MACHO: @.str = private unnamed_addr constant [4 x i8] c"one\00", section "__TEXT,__cstring,cstring_literals", align 1 -// CHECK-COFF: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align {{[48]}} -// CHECK-ELF32: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align 4 -// CHECK-ELF64: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "cfstring", align 8 -// CHECK-MACHO32: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "__DATA,__cfstring", align 4 -// CHECK-MACHO64: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "__DATA,__cfstring", align 8 +// CHECK-COFF: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align {{[48]}} +// CHECK-ELF32: @_unnamed_cfstring_ = private 
global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align 4 +// CHECK-ELF64: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "cfstring", align 8 +// CHECK-MACHO32: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "__DATA,__cfstring", align 4 +// CHECK-MACHO64: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "__DATA,__cfstring", align 8 // CHECK-COFF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2 // CHECK-ELF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2 // CHECK-MACHO: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], section "__TEXT,__ustring", align 2 -// CHECK-COFF: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align {{[48]}} -// CHECK-ELF32: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align 4 -// CHECK-ELF64: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "cfstring", align 8 -// CHECK-MACHO32: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "__DATA,__cfstring", align 4 -// CHECK-MACHO64: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "__DATA,__cfstring", align 8 +// CHECK-COFF: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align {{[48]}} +// CHECK-ELF32: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x 
i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align 4 +// CHECK-ELF64: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "cfstring", align 8 +// CHECK-MACHO32: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "__DATA,__cfstring", align 4 +// CHECK-MACHO64: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "__DATA,__cfstring", align 8 // CHECK-ELF-DATA-SECTION: .section .rodata.str1.1 // CHECK-ELF-DATA-SECTION: .asciz "one" @@ -49,3 +53,7 @@ const CFStringRef two = (CFStringRef)__builtin___CFStringMakeConstantString("\xe // CHECK-ELF-DATA-SECTION: .short 111 // CHECK-ELF-DATA-SECTION: .short 0 +// CHECK-ASM-COFF: .section cfstring,"dw" +// CHECK-ASM-ELF: .section cfstring,"aw" +// CHECK-ASM-MACHO: .section __DATA,__cfstring + diff --git a/test/CodeGen/Inputs/debug-info-file-checksum.c b/test/CodeGen/Inputs/debug-info-file-checksum.c new file mode 100644 index 000000000000..081f6f0c2891 --- /dev/null +++ b/test/CodeGen/Inputs/debug-info-file-checksum.c @@ -0,0 +1,3 @@ +int foo(int x) { + return x+1; +} diff --git a/test/CodeGen/Inputs/opt-record.proftext b/test/CodeGen/Inputs/opt-record.proftext new file mode 100644 index 000000000000..cf2530c6be54 --- /dev/null +++ b/test/CodeGen/Inputs/opt-record.proftext @@ -0,0 +1,26 @@ +foo +# Func Hash: +0 +# Num Counters: +1 +# Counter Values: +30 + +bar +# Func Hash: +0 +# Num Counters: +1 +# Counter Values: +30 + +Test +# Func Hash: +269 +# Num Counters: +3 +# Counter Values: +1 +30 +15 + diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c index 36500f62a5d9..2866990433df 100644 --- a/test/CodeGen/aarch64-neon-2velem.c +++ b/test/CodeGen/aarch64-neon-2velem.c @@ -4,7 +4,7 @@ #include <arm_neon.h> -// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmla_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -13,7 +13,7 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { return vmla_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlaq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -22,7 +22,7 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { return vmlaq_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmla_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // 
CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -31,7 +31,7 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { return vmla_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlaq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -40,7 +40,7 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { return vmlaq_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmla_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -49,7 +49,7 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { return vmla_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlaq_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -58,7 +58,7 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { return vmlaq_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmla_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -67,7 +67,7 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { return vmla_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlaq_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -76,7 +76,7 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { return vmlaq_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmls_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -85,7 +85,7 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { return vmls_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -94,7 +94,7 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { return 
vmlsq_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmls_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -103,7 +103,7 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { return vmls_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -112,7 +112,7 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { return vmlsq_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmls_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -121,7 +121,7 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { return vmls_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsq_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -130,7 +130,7 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { return vmlsq_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmls_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -139,7 +139,7 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { return vmls_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsq_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -148,7 +148,7 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { return vmlsq_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmul_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -156,7 +156,7 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { return vmul_lane_s16(a, v, 3); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmulq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, 
<8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -164,7 +164,7 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { return vmulq_lane_s16(a, v, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmul_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -172,7 +172,7 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { return vmul_lane_s32(a, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmulq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -180,7 +180,7 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { return vmulq_lane_s32(a, v, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmul_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -188,7 +188,7 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { return vmul_lane_u16(a, v, 3); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmulq_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -196,7 +196,7 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { return vmulq_lane_u16(a, v, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmul_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -204,7 +204,7 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { return vmul_lane_u32(a, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmulq_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -212,7 +212,7 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { return vmulq_lane_u32(a, v, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -220,7 +220,7 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { return vmul_laneq_s16(a, v, 7); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, 
i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -228,7 +228,7 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { return vmulq_laneq_s16(a, v, 7); } -// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -236,7 +236,7 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { return vmul_laneq_s32(a, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -244,7 +244,7 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { return vmulq_laneq_s32(a, v, 3); } -// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -252,7 +252,7 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { return vmul_laneq_u16(a, v, 7); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -260,7 +260,7 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { return vmulq_laneq_u16(a, v, 7); } -// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -268,7 +268,7 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { return vmul_laneq_u32(a, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -276,7 +276,7 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { return vmulq_laneq_u32(a, v, 3); } -// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vfma_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> @@ -290,7 +290,7 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vfma_lane_f32(a, b, v, 1); } -// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vfmaq_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x 
float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> @@ -304,7 +304,7 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vfmaq_lane_f32(a, b, v, 1); } -// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfma_laneq_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> @@ -318,7 +318,7 @@ float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vfma_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfmaq_laneq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> @@ -332,7 +332,7 @@ float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmaq_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vfms_lane_f32( // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> @@ -347,7 +347,7 @@ float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vfms_lane_f32(a, b, v, 1); } -// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vfmsq_lane_f32( // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> @@ -362,7 +362,7 @@ float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vfmsq_lane_f32(a, b, v, 1); } -// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfms_laneq_f32( // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> @@ -377,7 +377,7 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vfms_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfmsq_laneq_f32( // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> @@ -392,7 +392,7 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmsq_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 { +// CHECK-LABEL: @test_vfmaq_lane_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x 
double> %v to <8 x i8> @@ -406,7 +406,7 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { return vfmaq_lane_f64(a, b, v, 0); } -// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vfmaq_laneq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> @@ -420,7 +420,7 @@ float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmaq_laneq_f64(a, b, v, 1); } -// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 { +// CHECK-LABEL: @test_vfmsq_lane_f64( // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> @@ -435,7 +435,7 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { return vfmsq_lane_f64(a, b, v, 0); } -// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vfmsq_laneq_f64( // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> @@ -450,7 +450,7 @@ float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmsq_laneq_f64(a, b, v, 1); } -// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfmas_laneq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 @@ -460,7 +460,7 @@ float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) { return vfmas_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 { +// CHECK-LABEL: @test_vfmsd_lane_f64( // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> @@ -471,7 +471,7 @@ float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) { return vfmsd_lane_f64(a, b, v, 0); } -// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vfmss_laneq_f32( // CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> @@ -482,7 +482,7 @@ float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) { return vfmss_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vfmsd_laneq_f64( // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> @@ -493,991 +493,823 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) { return vfmsd_laneq_f64(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> 
%b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlal_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlal_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlal_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 
// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlal_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlal_high_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlal_high_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_laneq_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlal_high_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: 
@test_vmlal_high_laneq_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlal_high_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlsl_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlsl_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlsl_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlsl_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlsl_high_lane_s16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlsl_high_lane_s32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_s16( // CHECK: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlsl_high_laneq_s16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlsl_high_laneq_s32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlal_lane_u16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> 
[[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlal_lane_u32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlal_laneq_u16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlal_laneq_u32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlal_high_lane_u16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, 
<4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlal_high_lane_u32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_laneq_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlal_high_laneq_u16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_laneq_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlal_high_laneq_u32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 
x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlsl_lane_u16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlsl_lane_u32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlsl_laneq_u16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlsl_laneq_u32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: 
[[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlsl_high_lane_u16(a, b, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlsl_high_lane_u32(a, b, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlsl_high_laneq_u16(a, b, v, 7); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlsl_high_laneq_u32(a, b, v, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { return vmull_lane_s16(a, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { return vmull_lane_s32(a, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { return vmull_lane_u16(a, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// 
CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { return vmull_lane_u32(a, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { return vmull_high_lane_s16(a, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { return vmull_high_lane_s32(a, v, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { return vmull_high_lane_u16(a, v, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, 
<2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmull_high_lane_u32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
   return vmull_high_lane_u32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmull_laneq_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
   return vmull_laneq_s16(a, v, 7);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmull_laneq_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
   return vmull_laneq_s32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmull_laneq_u16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
   return vmull_laneq_u16(a, v, 7);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmull_laneq_u32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
   return vmull_laneq_u32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmull_high_laneq_s16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
   return vmull_high_laneq_s16(a, v, 7);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmull_high_laneq_s32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
   return vmull_high_laneq_s32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmull_high_laneq_u16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
   return vmull_high_laneq_u16(a, v, 7);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmull_high_laneq_u32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
   return vmull_high_laneq_u32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
   return vqdmlal_lane_s16(a, b, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
   return vqdmlal_lane_s32(a, b, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
   return vqdmlal_high_lane_s16(a, b, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
   return vqdmlal_high_lane_s32(a, b, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
   return vqdmlsl_lane_s16(a, b, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
   return vqdmlsl_lane_s32(a, b, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
   return vqdmlsl_high_lane_s16(a, b, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
   return vqdmlsl_high_lane_s32(a, b, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmull_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmull_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_laneq_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqdmull_laneq_s16(a, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_laneq_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqdmull_laneq_s32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_high_lane_s16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmull_high_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_high_lane_s32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmull_high_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqdmull_high_laneq_s16(a, v, 7);
 }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqdmull_high_laneq_s32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 3);
 }
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 1);
 }
-// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
 // CHECK: ret <2 x float> [[MUL]]
@@ -1485,8 +1317,7 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
   return vmul_lane_f32(a, v, 1);
 }
-
-// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_f64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
@@ -1495,20 +1326,21 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
 // CHECK: ret <1 x double> [[TMP5]]
+
 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
   return vmul_lane_f64(a, v, 0);
 }
-
-// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
 // CHECK: ret <4 x float> [[MUL]]
+
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
   return vmulq_lane_f32(a, v, 1);
 }
-// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_f64(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
 // CHECK: ret <2 x double> [[MUL]]
@@ -1516,7 +1348,7 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
   return vmulq_lane_f64(a, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
 // CHECK: ret <2 x float> [[MUL]]
@@ -1524,7 +1356,7 @@ float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
   return vmul_laneq_f32(a, v, 3);
 }
-// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_f64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
@@ -1537,16 +1369,16 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
   return vmul_laneq_f64(a, v, 1);
 }
-
-// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
 // CHECK: ret <4 x float> [[MUL]]
+
 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
   return vmulq_laneq_f32(a, v, 3);
 }
-// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_f64(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
 // CHECK: ret <2 x double> [[MUL]]
@@ -1554,79 +1386,67 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
   return vmulq_laneq_f64(a, v, 1);
 }
-// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulx_lane_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
 // CHECK: ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
   return vmulx_lane_f32(a, v, 1);
 }
-// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulxq_lane_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
 // CHECK: ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
   return vmulxq_lane_f32(a, v, 1);
 }
-// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK-LABEL: @test_vmulxq_lane_f64(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
 // CHECK: ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
   return vmulxq_lane_f64(a, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulx_laneq_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
 // CHECK: ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
   return vmulx_laneq_f32(a, v, 3);
 }
-// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vmulxq_laneq_f32(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
 // CHECK: ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
   return vmulxq_laneq_f32(a, v, 3);
 }
-// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK-LABEL: @test_vmulxq_laneq_f64(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
 // CHECK: ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
   return vmulxq_laneq_f64(a, v, 1);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmla_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
@@ -1635,7 +1455,7 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
   return vmla_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
@@ -1644,7 +1464,7 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
   return vmlaq_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmla_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
@@ -1653,7 +1473,7 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
   return vmla_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
@@ -1662,7 +1482,7 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
   return vmlaq_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmla_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
@@ -1671,7 +1491,7 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
   return vmla_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
@@ -1680,7 +1500,7 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
   return vmlaq_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmla_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
@@ -1689,7 +1509,7 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
   return vmla_laneq_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
@@ -1698,7 +1518,7 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
   return vmlaq_laneq_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmls_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
@@ -1707,7 +1527,7 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
   return vmls_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
@@ -1716,7 +1536,7 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
   return vmlsq_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmls_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
@@ -1725,7 +1545,7 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
   return vmls_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
@@ -1734,7 +1554,7 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
   return vmlsq_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmls_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
@@ -1743,7 +1563,7 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
   return vmls_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
@@ -1752,7 +1572,7 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
   return vmlsq_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmls_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
@@ -1761,7 +1581,7 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
   return vmls_laneq_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
@@ -1770,7 +1590,7 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
   return vmlsq_laneq_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i16> [[MUL]]
@@ -1778,7 +1598,7 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vmul_lane_s16(a, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <8 x i16> [[MUL]]
@@ -1786,7 +1606,7 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vmulq_lane_s16(a, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <2 x i32> [[MUL]]
@@ -1794,7 +1614,7 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vmul_lane_s32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i32> [[MUL]]
@@ -1802,7 +1622,7 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vmulq_lane_s32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_u16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i16> [[MUL]]
@@ -1810,7 +1630,7 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
   return vmul_lane_u16(a, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_u16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <8 x i16> [[MUL]]
@@ -1818,7 +1638,7 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
   return vmulq_lane_u16(a, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmul_lane_u32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <2 x i32> [[MUL]]
@@ -1826,7 +1646,7 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
   return vmul_lane_u32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmulq_lane_u32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i32> [[MUL]]
@@ -1834,7 +1654,7 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
   return vmulq_lane_u32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i16> [[MUL]]
@@ -1842,7 +1662,7 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vmul_laneq_s16(a, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <8 x i16> [[MUL]]
@@ -1850,7 +1670,7 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vmulq_laneq_s16(a, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <2 x i32> [[MUL]]
@@ -1858,7 +1678,7 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vmul_laneq_s32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i32> [[MUL]]
@@ -1866,7 +1686,7 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vmulq_laneq_s32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_u16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i16> [[MUL]]
@@ -1874,7 +1694,7 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
   return vmul_laneq_u16(a, v, 0);
 }
-// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_u16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 // CHECK: ret <8 x i16> [[MUL]]
@@ -1882,7 +1702,7 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
   return vmulq_laneq_u16(a, v, 0);
 }
-// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmul_laneq_u32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <2 x i32> [[MUL]]
@@ -1890,7 +1710,7 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
   return vmul_laneq_u32(a, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmulq_laneq_u32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 // CHECK: ret <4 x i32> [[MUL]]
@@ -1898,7 +1718,7 @@ uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
   return vmulq_laneq_u32(a, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vfma_lane_f32_0(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
@@ -1912,7 +1732,7 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vfmaq_lane_f32_0(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
@@ -1926,7 +1746,7 @@ float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vfmaq_lane_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vfma_laneq_f32_0(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
@@ -1940,7 +1760,7 @@ float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vfma_laneq_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
@@ -1954,7 +1774,7 @@ float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
   return vfmaq_laneq_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vfms_lane_f32_0(
 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
@@ -1969,7 +1789,7 @@ float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vfms_lane_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK-LABEL: @test_vfmsq_lane_f32_0(
 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
@@ -1984,7 +1804,7 @@ float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vfmsq_lane_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vfms_laneq_f32_0(
 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
@@ -1999,7 +1819,7 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vfms_laneq_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
@@ -2014,7 +1834,7 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
   return vfmsq_laneq_f32(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
@@ -2028,7 +1848,7 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
   return vfmaq_laneq_f64(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
@@ -2043,991 +1863,821 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
   return vfmsq_laneq_f64(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlal_lane_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 // CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   return vmlal_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlal_lane_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 // CHECK: ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   return vmlal_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlal_laneq_s16_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 // CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   return vmlal_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlal_laneq_s32_0(
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 // CHECK: ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   return vmlal_laneq_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 // CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   return vmlal_high_lane_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 // CHECK: ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   return vmlal_high_lane_s32(a, b, v, 0);
 }
-// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 // CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   return vmlal_high_laneq_s16(a, b, v, 0);
 }
-// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 // CHECK:
[[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlal_high_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlsl_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlsl_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlsl_laneq_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: 
[[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlsl_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlsl_high_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlsl_high_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // 
CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlsl_high_laneq_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlsl_high_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlal_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlal_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x 
i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlal_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlal_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlal_high_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_lane_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t 
test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlal_high_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_laneq_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlal_high_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlal_high_laneq_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlal_high_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlsl_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlsl_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlsl_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlsl_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, 
int16x8_t b, int16x4_t v) { return vmlsl_high_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_lane_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlsl_high_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlsl_high_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlsl_high_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> 
zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { return vmull_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { return vmull_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { return vmull_lane_u16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { return vmull_lane_u32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 
x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { return vmull_high_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { return vmull_high_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { return vmull_high_lane_u16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_high_lane_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> 
[[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { return vmull_high_lane_u32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_laneq_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { return vmull_laneq_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_laneq_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { return vmull_laneq_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { return vmull_laneq_u16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> 
[[VMULL2_I]] uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { return vmull_laneq_u32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_laneq_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { return vmull_high_laneq_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_high_laneq_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { return vmull_high_laneq_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmull_high_laneq_u16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { return vmull_high_laneq_u16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmull_high_laneq_u32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to 
<8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { return vmull_high_laneq_u32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { return vqdmlal_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { return vqdmlal_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x 
i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { return vqdmlal_high_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { return vqdmlal_high_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x 
i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { return vqdmlsl_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { return vqdmlsl_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { return vqdmlsl_high_lane_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { return vqdmlsl_high_lane_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmull_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { return vqdmull_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmull_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { return vqdmull_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmull_laneq_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> 
[[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { return vqdmull_laneq_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmull_laneq_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { return vqdmull_laneq_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmull_high_lane_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { return vqdmull_high_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmull_high_lane_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> 
[[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { return vqdmull_high_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { return vqdmull_high_laneq_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { return vqdmull_high_laneq_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmulh_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 
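// Aside: a minimal hand-written sketch of the pattern shared by these
// updated tests. The frontend now passes the incoming vectors straight to
// the AArch64 intrinsic, so the regenerated CHECK lines match the call on
// %a and the lane splat directly instead of the <8 x i8>/<16 x i8> bitcast
// round-trips the old lines expected. The test name below is hypothetical,
// and the real file's RUN lines and attribute groups (#0, #2) are assumed,
// not shown; it mirrors test_vqdmulh_lane_s16_0 from this hunk.
#include <arm_neon.h>
// CHECK-LABEL: @example_vqdmulh_lane(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
// CHECK: ret <4 x i16> [[RES]]
int16x4_t example_vqdmulh_lane(int16x4_t a, int16x4_t v) {
  // Lane 0 of v is duplicated across all lanes, then the saturating
  // doubling multiply-high intrinsic consumes a and the splat as-is.
  return vqdmulh_lane_s16(a, v, 0);
}
// Checking %a and [[SHUFFLE]] directly keeps each test focused on intrinsic
// selection rather than incidental bitcast noise, which is presumably why
// the regenerated checks throughout this file are shorter.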
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQDMULH_V2_I]] int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { return vqdmulh_lane_s16(a, v, 0); } -// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmulhq_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { return vqdmulhq_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmulh_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQDMULH_V2_I]] int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { return vqdmulh_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmulhq_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x 
i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { return vqdmulhq_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqrdmulh_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { return vqrdmulh_lane_s16(a, v, 0); } -// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2 +// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { return vqrdmulhq_lane_s16(a, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqrdmulh_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> 
[[SHUFFLE]]) #2 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { return vqrdmulh_lane_s32(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2 +// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { return vqrdmulhq_lane_s32(a, v, 0); } -// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vmul_lane_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] // CHECK: ret <2 x float> [[MUL]] @@ -3035,7 +2685,7 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { return vmul_lane_f32(a, v, 0); } -// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vmulq_lane_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] // CHECK: ret <4 x float> [[MUL]] @@ -3043,7 +2693,7 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { return vmulq_lane_f32(a, v, 0); } -// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] // CHECK: ret <2 x float> [[MUL]] @@ -3051,7 +2701,7 @@ float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { return vmul_laneq_f32(a, v, 0); } -// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vmul_laneq_f64_0( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double @@ -3064,7 +2714,7 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { return vmul_laneq_f64(a, v, 0); } -// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, 
[[SHUFFLE]] // CHECK: ret <4 x float> [[MUL]] @@ -3072,7 +2722,7 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { return vmulq_laneq_f32(a, v, 0); } -// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vmulq_laneq_f64_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] // CHECK: ret <2 x double> [[MUL]] @@ -3080,79 +2730,67 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { return vmulq_laneq_f64(a, v, 0); } -// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vmulx_lane_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2 // CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { return vmulx_lane_f32(a, v, 0); } -// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK-LABEL: @test_vmulxq_lane_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2 // CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { return vmulxq_lane_f32(a, v, 0); } -// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 { +// CHECK-LABEL: @test_vmulxq_lane_f64_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2 // CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { return vmulxq_lane_f64(a, v, 0); } -// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vmulx_laneq_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x 
float> %v, <4 x float> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2 // CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { return vmulx_laneq_f32(a, v, 0); } -// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK-LABEL: @test_vmulxq_laneq_f32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2 // CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { return vmulxq_laneq_f32(a, v, 0); } -// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK-LABEL: @test_vmulxq_laneq_f64_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2 // CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { return vmulxq_laneq_f64(a, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmull_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -3160,29 +2798,25 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL4_I_I]]) #2 +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: ret <4 x i32> [[VMULL5_I_I]] int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { return vmull_high_n_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: ret <2 x i64> [[VMULL3_I_I]] int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { return vmull_high_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmull_high_n_u16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -3190,29 +2824,25 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: ret <4 x i32> [[VMULL5_I_I]] uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { return vmull_high_n_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_high_n_u32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) 
#2 // CHECK: ret <2 x i64> [[VMULL3_I_I]] uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { return vmull_high_n_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqdmull_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -3220,33 +2850,27 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2 +// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V5_I_I]] int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { return vqdmull_high_n_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqdmull_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2 +// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V3_I_I]] int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { return vqdmull_high_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlal_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 @@ -3254,31 +2878,27 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = 
bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] // CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vmlal_high_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] // CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vmlal_high_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlal_high_n_u16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 @@ -3286,31 +2906,27 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] // CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlal_high_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_high_n_u32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: 
[[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] // CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlal_high_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vqdmlal_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> @@ -3319,34 +2935,28 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 -// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 +// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]] int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlal_high_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vqdmlal_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 -// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x 
i64> -// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 +// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]] int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlal_high_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsl_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 @@ -3354,31 +2964,27 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] // CHECK: ret <4 x i32> [[SUB_I_I]] int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vmlsl_high_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] // CHECK: ret <2 x i64> [[SUB_I_I]] int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vmlsl_high_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsl_high_n_u16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 @@ -3386,31 +2992,27 @@ 
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] // CHECK: ret <4 x i32> [[SUB_I_I]] uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlsl_high_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_high_n_u32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] // CHECK: ret <2 x i64> [[SUB_I_I]] uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlsl_high_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_n_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> @@ -3419,34 +3021,28 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 -// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2 +// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]] int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlsl_high_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_n_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 -// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2 +// CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]] int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlsl_high_n_s32(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK-LABEL: @test_vmul_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] @@ -3455,7 +3051,7 @@ float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK-LABEL: @test_vmulq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 @@ -3466,7 +3062,7 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { return vmulq_n_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 { +// CHECK-LABEL: @test_vmulq_n_f64( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1 // CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]] @@ -3475,22 +3071,19 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) { return vmulq_n_f64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK-LABEL: @test_vfma_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 -// CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a) #2 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { return vfma_n_f32(a, b, n); } -// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK-LABEL: @test_vfmaq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2 @@ -3498,32 +3091,26 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a) #2 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { return vfmaq_n_f32(a, b, n); } -// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK-LABEL: @test_vfms_n_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 -// CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a) #2 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { return vfms_n_f32(a, b, n); } -// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK-LABEL: @test_vfmsq_n_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, 
%b // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 @@ -3532,16 +3119,13 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a) #2 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { return vfmsq_n_f32(a, b, n); } -// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmul_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -3552,7 +3136,7 @@ int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -3567,7 +3151,7 @@ int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { return vmulq_n_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmul_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] @@ -3576,7 +3160,7 @@ int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 @@ -3587,7 +3171,7 @@ int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { return vmulq_n_s32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmul_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -3598,7 +3182,7 @@ uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> 
undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -3613,7 +3197,7 @@ uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { return vmulq_n_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmul_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] @@ -3622,7 +3206,7 @@ uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 @@ -3633,112 +3217,95 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { return vmulq_n_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmull_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vmull_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x 
i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqdmull_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2 +// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULL_V5_I]] int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { return vqdmull_n_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqdmull_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> 
[[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V3_I]] int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { return vqdmull_n_s32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqdmulh_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2 +// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQDMULH_V5_I]] int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqdmulhq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 @@ -3749,66 +3316,54 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2 +// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2 // CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]] int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { return vqdmulhq_n_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqdmulh_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = 
bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQDMULH_V3_I]] int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqdmulhq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2 +// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2 // CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]] int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { return vqdmulhq_n_s32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqrdmulh_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2 +// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]] int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqrdmulhq_n_s16( // CHECK: [[TMP0:%.*]] = 
bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 @@ -3819,49 +3374,40 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2 +// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2 // CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]] int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqrdmulh_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]] int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqrdmulhq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2 +// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2 // CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> 
[[VQRDMULHQ_V6_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]] int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmla_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -3873,7 +3419,7 @@ int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -3889,7 +3435,7 @@ int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmla_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -3899,7 +3445,7 @@ int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -3911,7 +3457,7 @@ int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmla_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -3923,7 +3469,7 @@ uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -3939,7 +3485,7 @@ uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmla_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -3949,7 +3495,7 @@ uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) 
{ return vmla_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -3961,67 +3507,59 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlal_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vqdmlal_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -4029,33 +3567,27 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 +// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlal_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vqdmlal_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 +// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlal_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmls_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -4067,7 +3599,7 @@ int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -4083,7 +3615,7 @@ int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlsq_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmls_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -4093,7 +3625,7 @@ int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -4105,7 +3637,7 @@ int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmls_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -4117,7 +3649,7 @@ uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -4133,7 +3665,7 @@ uint16x8_t test_vmlsq_n_u16(uint16x8_t a, 
uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmls_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -4143,7 +3675,7 @@ uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -4155,67 +3687,59 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], 
i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -4223,33 +3747,27 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2 +// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I]] int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlsl_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2 +// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I]] int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlsl_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmla_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -4258,7 +3776,7 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { return vmla_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlaq_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -4267,7 +3785,7 @@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { return vmlaq_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmla_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -4276,7 +3794,7 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { return vmla_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlaq_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -4285,7 +3803,7 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { return vmlaq_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmla_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -4294,7 +3812,7 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { return vmla_laneq_u16(a, b, v, 0); } -// 
CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlaq_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -4303,7 +3821,7 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { return vmlaq_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmla_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -4312,7 +3830,7 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { return vmla_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlaq_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -4321,69 +3839,57 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { return vmlaq_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_laneq_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { return vqdmlal_laneq_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_laneq_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: 
[[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { return vqdmlal_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { return vqdmlal_high_laneq_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { return vqdmlal_high_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// 
CHECK-LABEL: @test_vmls_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -4392,7 +3898,7 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { return vmls_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsq_lane_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -4401,7 +3907,7 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { return vmlsq_lane_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmls_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -4410,7 +3916,7 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { return vmls_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsq_lane_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -4419,7 +3925,7 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { return vmlsq_lane_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmls_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -4428,7 +3934,7 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { return vmls_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vmlsq_laneq_u16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -4437,7 +3943,7 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { return vmlsq_laneq_u16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmls_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -4446,7 +3952,7 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { return vmls_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vmlsq_laneq_u32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -4455,181 +3961,145 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { return vmlsq_laneq_u32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { return vqdmlsl_laneq_s16(a, b, v, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 -// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { return vqdmlsl_laneq_s32(a, b, v, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: 
[[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 0);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 0);
}
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 0);
}
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 0);
}
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 0);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 0);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 0);
}
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 0);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 0);
}
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 0);
}
-// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
@@ -4638,7 +4108,7 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 3);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
@@ -4647,7 +4117,7 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 3);
}
-// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
@@ -4656,7 +4126,7 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 1);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
@@ -4665,7 +4135,7 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmla_laneq_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
@@ -4674,7 +4144,7 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 7);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_laneq_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
@@ -4683,7 +4153,7 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmla_laneq_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
@@ -4692,7 +4162,7 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlaq_laneq_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
@@ -4701,69 +4171,57 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
@@ -4772,7 +4230,7 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 3);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
@@ -4781,7 +4239,7 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 3);
}
-// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
@@ -4790,7 +4248,7 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 1);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
@@ -4799,7 +4257,7 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmls_laneq_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
@@ -4808,7 +4266,7 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 7);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_laneq_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
@@ -4817,7 +4275,7 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmls_laneq_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
@@ -4826,7 +4284,7 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vmlsq_laneq_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
@@ -4835,177 +4293,140 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 7);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 3);
}
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 7);
}
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 7);
}
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 3);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 7);
}
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 7);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 3);
}
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 3);
}
-
diff --git a/test/CodeGen/aarch64-neon-across.c b/test/CodeGen/aarch64-neon-across.c
index 04a7b26e8a27..6d7a0d5bcde4 100644
--- a/test/CodeGen/aarch64-neon-across.c
+++ b/test/CodeGen/aarch64-neon-across.c
@@ -14,9 +14,7 @@ int16_t test_vaddlv_s8(int8x8_t a) {
}
// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a) #2
// CHECK: ret i32 [[VADDLV_I]]
int32_t test_vaddlv_s16(int16x4_t a) {
  return vaddlv_s16(a);
@@ -31,9 +29,7 @@ uint16_t test_vaddlv_u8(uint8x8_t a) {
}
// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a) #2
// CHECK: ret i32 [[VADDLV_I]]
uint32_t test_vaddlv_u16(uint16x4_t a) {
  return vaddlv_u16(a);
@@ -48,18 +44,14 @@ int16_t test_vaddlvq_s8(int8x16_t a) {
}
// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a) #2
// CHECK: ret i32 [[VADDLV_I]]
int32_t test_vaddlvq_s16(int16x8_t a) {
  return vaddlvq_s16(a);
}
// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a) #2
// CHECK: ret i64 [[VADDLVQ_S32_I]]
int64_t test_vaddlvq_s32(int32x4_t a) {
  return vaddlvq_s32(a);
@@ -74,18 +66,14 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) {
}
// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a) #2
// CHECK: ret i32 [[VADDLV_I]]
uint32_t test_vaddlvq_u16(uint16x8_t a) {
  return vaddlvq_u16(a);
}
// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a) #2
// CHECK: ret i64 [[VADDLVQ_U32_I]]
uint64_t test_vaddlvq_u32(uint32x4_t a) {
  return vaddlvq_u32(a);
@@ -100,9 +88,7 @@ int8_t test_vmaxv_s8(int8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vmaxv_s16(int16x4_t a) {
@@ -118,9 +104,7 @@ uint8_t test_vmaxv_u8(uint8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vmaxv_u16(uint16x4_t a) {
@@ -136,9 +120,7 @@ int8_t test_vmaxvq_s8(int8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vmaxvq_s16(int16x8_t a) {
@@ -146,9 +128,7 @@ int16_t test_vmaxvq_s16(int16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VMAXVQ_S32_I]]
int32_t test_vmaxvq_s32(int32x4_t a) {
  return vmaxvq_s32(a);
@@ -163,9 +143,7 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vmaxvq_u16(uint16x8_t a) {
@@ -173,9 +151,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VMAXVQ_U32_I]]
uint32_t test_vmaxvq_u32(uint32x4_t a) {
  return vmaxvq_u32(a);
@@ -190,9 +166,7 @@ int8_t test_vminv_s8(int8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vminv_s16(int16x4_t a) {
@@ -208,9 +182,7 @@ uint8_t test_vminv_u8(uint8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vminv_u16(uint16x4_t a) {
@@ -226,9 +198,7 @@ int8_t test_vminvq_s8(int8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vminvq_s16(int16x8_t a) {
@@ -236,9 +206,7 @@ int16_t test_vminvq_s16(int16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VMINVQ_S32_I]]
int32_t test_vminvq_s32(int32x4_t a) {
  return vminvq_s32(a);
@@ -253,9 +221,7 @@ uint8_t test_vminvq_u8(uint8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vminvq_u16(uint16x8_t a) {
@@ -263,9 +229,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VMINVQ_U32_I]]
uint32_t test_vminvq_u32(uint32x4_t a) {
  return vminvq_u32(a);
@@ -280,9 +244,7 @@ int8_t test_vaddv_s8(int8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vaddv_s16(int16x4_t a) {
@@ -298,9 +260,7 @@ uint8_t test_vaddv_u8(uint8x8_t a) {
}
// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vaddv_u16(uint16x4_t a) {
@@ -316,9 +276,7 @@ int8_t test_vaddvq_s8(int8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vaddvq_s16(int16x8_t a) {
@@ -326,9 +284,7 @@ int16_t test_vaddvq_s16(int16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VADDVQ_S32_I]]
int32_t test_vaddvq_s32(int32x4_t a) {
  return vaddvq_s32(a);
@@ -343,9 +299,7 @@ uint8_t test_vaddvq_u8(uint8x16_t a) {
}
// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vaddvq_u16(uint16x8_t a) {
@@ -353,45 +307,35 @@ uint16_t test_vaddvq_u16(uint16x8_t a) {
}
// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a) #2
// CHECK: ret i32 [[VADDVQ_U32_I]]
uint32_t test_vaddvq_u32(uint32x4_t a) {
  return vaddvq_u32(a);
}
// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a) #2
// CHECK: ret float [[VMAXVQ_F32_I]]
float32_t test_vmaxvq_f32(float32x4_t a) {
  return vmaxvq_f32(a);
}
// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a) #2
// CHECK: ret float [[VMINVQ_F32_I]]
float32_t test_vminvq_f32(float32x4_t a) {
  return vminvq_f32(a);
}
// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a) #2
// CHECK: ret float [[VMAXNMVQ_F32_I]]
float32_t test_vmaxnmvq_f32(float32x4_t a) {
  return vmaxnmvq_f32(a);
}
// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a) #2
// CHECK: ret float [[VMINNMVQ_F32_I]]
float32_t test_vminnmvq_f32(float32x4_t a) {
  return vminnmvq_f32(a);
diff --git a/test/CodeGen/aarch64-neon-fma.c b/test/CodeGen/aarch64-neon-fma.c
index 836321af0609..6ada533c66ee 100644
--- a/test/CodeGen/aarch64-neon-fma.c
+++ b/test/CodeGen/aarch64-neon-fma.c
@@ -214,13 +214,7 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> [[VECINIT1_I]], <2 x double> %a)
// CHECK: ret <2 x double> [[TMP6]]
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
  return vfmaq_n_f64(a, b, c);
@@ -230,13 +224,7 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK: [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> [[VECINIT1_I]], <2 x double> %a) #2
// CHECK: ret <2 x double> [[TMP6]]
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
  return vfmsq_n_f64(a, b, c);
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index b087ce91e568..2ffbcdce372c 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -7,396 +7,392 @@
#include <arm_neon.h>
-// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vadd_s8(
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[ADD_I]]
int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
  return vadd_s8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vadd_s16(
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[ADD_I]]
int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
  return vadd_s16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vadd_s32(
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[ADD_I]]
int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
  return vadd_s32(v1, v2);
}
-// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vadd_s64(
// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
// CHECK: ret <1 x i64> [[ADD_I]]
int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
  return vadd_s64(v1, v2);
}
-// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK-LABEL: @test_vadd_f32(
// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
// CHECK: ret <2 x float> [[ADD_I]]
float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
  return vadd_f32(v1, v2);
}
-// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vadd_u8(
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[ADD_I]]
uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
  return vadd_u8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vadd_u16(
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[ADD_I]]
uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
  return vadd_u16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vadd_u32(
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[ADD_I]]
uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
  return vadd_u32(v1, v2);
}
-// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vadd_u64(
// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
// CHECK: ret <1 x i64> [[ADD_I]]
uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
  return vadd_u64(v1, v2);
}
-// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_s8(
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[ADD_I]]
int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
  return vaddq_s8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_s16(
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
  return vaddq_s16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_s32(
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[ADD_I]]
-int32x4_t test_vaddq_s32(int32x4_t v1,int32x4_t v2) {
+int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
  return vaddq_s32(v1, v2);
}
-// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_s64(
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
  return vaddq_s64(v1, v2);
}
-// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_f32(
// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
// CHECK: ret <4 x float> [[ADD_I]]
float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
  return vaddq_f32(v1, v2);
}
-// CHECK-LABEL: define <2 x double> @test_vaddq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_f64(
// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
// CHECK: ret <2 x double> [[ADD_I]]
float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
  return vaddq_f64(v1, v2);
}
-// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_u8(
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[ADD_I]]
uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vaddq_u8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_u16(
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vaddq_u16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_u32(
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vaddq_u32(v1, v2);
}
-// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vaddq_u64(
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vaddq_u64(v1, v2);
}
-// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vsub_s8(
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
  return vsub_s8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+
+// CHECK-LABEL: @test_vsub_s16(
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
  return vsub_s16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+
+// CHECK-LABEL: @test_vsub_s32(
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
  return vsub_s32(v1, v2);
}
-// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vsub_s64(
// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
// CHECK: ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
  return vsub_s64(v1, v2);
}
-// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK-LABEL: @test_vsub_f32(
// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
  return vsub_f32(v1, v2);
}
-// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vsub_u8(
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
  return vsub_u8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vsub_u16(
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
  return vsub_u16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vsub_u32(
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
  return vsub_u32(v1, v2);
}
-// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vsub_u64(
// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
// CHECK: ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
  return vsub_u64(v1, v2);
}
-// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_s8(
// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
  return vsubq_s8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_s16(
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
  return vsubq_s16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_s32(
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[SUB_I]]
-int32x4_t test_vsubq_s32(int32x4_t v1,int32x4_t v2) {
+int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) {
  return vsubq_s32(v1, v2);
}
-// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_s64(
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
  return vsubq_s64(v1, v2);
}
-// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_f32(
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
  return vsubq_f32(v1, v2);
}
-// CHECK-LABEL: define <2 x double> @test_vsubq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_f64(
// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
// CHECK: ret <2 x double> [[SUB_I]]
float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
  return vsubq_f64(v1, v2);
}
-// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_u8(
// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vsubq_u8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_u16(
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vsubq_u16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_u32(
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vsubq_u32(v1, v2);
}
-// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK-LABEL: @test_vsubq_u64(
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vsubq_u64(v1, v2);
}
-// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vmul_s8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[MUL_I]]
int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
  return vmul_s8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vmul_s16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
  return vmul_s16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vmul_s32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
  return vmul_s32(v1, v2);
}
-// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK-LABEL: @test_vmul_f32(
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
// CHECK: ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
  return vmul_f32(v1, v2);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vmul_u8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
// CHECK: ret <8 x i8> [[MUL_I]]
uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
  return vmul_u8(v1, v2);
}
-// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vmul_u16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
// CHECK: ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
  return vmul_u16(v1, v2);
}
-// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vmul_u32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
// CHECK: ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
  return vmul_u32(v1, v2);
}
-// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_s8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[MUL_I]]
int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
  return vmulq_s8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_s16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
  return vmulq_s16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_s32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
  return vmulq_s32(v1, v2);
}
-
-// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+
+// CHECK-LABEL: @test_vmulq_u8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
// CHECK: ret <16 x i8> [[MUL_I]]
uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vmulq_u8(v1, v2);
}
-// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_u16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
// CHECK: ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vmulq_u16(v1, v2);
}
-// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_u32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
// CHECK: ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vmulq_u32(v1, v2);
}
-// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_f32(
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
// CHECK: ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
  return vmulq_f32(v1, v2);
}
-// CHECK-LABEL: define <2 x double> @test_vmulq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_f64(
// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
// CHECK: ret <2 x double> [[MUL_I]]
float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
  return vmulq_f64(v1, v2);
}
-// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vmul_p8(
// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK: ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
-  // test_vmul_p8
  return vmul_p8(v1, v2);
-  // pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
-// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vmulq_p8(
// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK: ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
-  // test_vmulq_p8
  return vmulq_p8(v1, v2);
-  // pmul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
-
-// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK-LABEL: @test_vmla_s8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
// CHECK: ret <8 x i8> [[ADD_I]]
@@ -404,7 +400,7 @@ int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vmla_s8(v1, v2, v3);
}
-// CHECK-LABEL: define <8 x i8> @test_vmla_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK-LABEL: @test_vmla_s16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
@@ -413,7 +409,7 @@ int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return vmla_s16(v1, v2, v3);
}
-// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK-LABEL: @test_vmla_s32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
@@ -421,7 +417,7 @@ int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vmla_s32(v1, v2, v3);
}
-// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3)
#0 { +// CHECK-LABEL: @test_vmla_f32( // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 // CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] // CHECK: ret <2 x float> [[ADD_I]] @@ -429,7 +425,7 @@ float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmla_f32(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vmla_u8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -437,7 +433,7 @@ uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { return vmla_u8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmla_u16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] // CHECK: ret <4 x i16> [[ADD_I]] @@ -445,7 +441,7 @@ uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { return vmla_u16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmla_u32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] // CHECK: ret <2 x i32> [[ADD_I]] @@ -453,7 +449,7 @@ uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { return vmla_u32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_s8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -461,7 +457,7 @@ int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { return vmlaq_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_s16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -469,15 +465,15 @@ int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { return vmlaq_s16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_s32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { return vmlaq_s32(v1, v2, v3); -} +} -// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_f32( // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 // CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]] // CHECK: ret <4 x float> [[ADD_I]] @@ -485,7 +481,7 @@ float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlaq_f32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_u8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -493,7 +489,7 @@ uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { return vmlaq_u8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// 
CHECK-LABEL: @test_vmlaq_u16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -501,7 +497,7 @@ uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { return vmlaq_u16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_u32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] // CHECK: ret <4 x i32> [[ADD_I]] @@ -509,7 +505,7 @@ uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { return vmlaq_u32(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vmlaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK-LABEL: @test_vmlaq_f64( // CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 // CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]] // CHECK: ret <2 x double> [[ADD_I]] @@ -517,7 +513,7 @@ float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlaq_f64(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vmls_s8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] // CHECK: ret <8 x i8> [[SUB_I]] @@ -525,7 +521,7 @@ int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { return vmls_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vmls_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmls_s16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8> @@ -534,7 +530,7 @@ int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { return vmls_s16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmls_s32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] // CHECK: ret <2 x i32> [[SUB_I]] @@ -542,7 +538,7 @@ int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { return vmls_s32(v1, v2, v3); } -// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK-LABEL: @test_vmls_f32( // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 // CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]] // CHECK: ret <2 x float> [[SUB_I]] @@ -550,7 +546,7 @@ float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmls_f32(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vmls_u8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] // CHECK: ret <8 x i8> [[SUB_I]] @@ -558,7 +554,7 @@ uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { return vmls_u8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmls_u16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] // CHECK: ret <4 x i16> [[SUB_I]] @@ -566,14 +562,15 @@ uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { return vmls_u16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %v1, <2 x i32> 
%v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmls_u32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] // CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { return vmls_u32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { + +// CHECK-LABEL: @test_vmlsq_s8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] // CHECK: ret <16 x i8> [[SUB_I]] @@ -581,7 +578,7 @@ int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { return vmlsq_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_s16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -589,7 +586,7 @@ int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { return vmlsq_s16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_s32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] // CHECK: ret <4 x i32> [[SUB_I]] @@ -597,14 +594,15 @@ int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { return vmlsq_s32(v1, v2, v3); } -// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_f32( // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 // CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]] // CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlsq_f32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { + +// CHECK-LABEL: @test_vmlsq_u8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] // CHECK: ret <16 x i8> [[SUB_I]] @@ -612,7 +610,7 @@ uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { return vmlsq_u8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_u16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -620,7 +618,7 @@ uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { return vmlsq_u16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_u32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] // CHECK: ret <4 x i32> [[SUB_I]] @@ -628,115 +626,99 @@ uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { return vmlsq_u32(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vmlsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK-LABEL: @test_vmlsq_f64( // CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 // CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]] // CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlsq_f64(v1, v2, v3); } -// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %v1, <2 x float> 
%v2, <2 x float> %v3) #0 { + +// CHECK-LABEL: @test_vfma_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 -// CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1) #4 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfma_f32(v1, v2, v3); } -// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK-LABEL: @test_vfmaq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1) #4 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vfmaq_f32(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vfmaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK-LABEL: @test_vfmaq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4 -// CHECK: ret <2 x double> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1) #4 +// CHECK: ret <2 x double> [[TMP3]] float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vfmaq_f64(v1, v2, v3); } -// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { + +// CHECK-LABEL: @test_vfms_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v2 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 -// 
CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1) #4 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfms_f32(v1, v2, v3); } -// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK-LABEL: @test_vfmsq_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v2 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1) #4 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vfmsq_f32(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vfmsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK-LABEL: @test_vfmsq_f64( // CHECK: [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v2 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4 -// CHECK: ret <2 x double> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1) #4 +// CHECK: ret <2 x double> [[TMP3]] float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vfmsq_f64(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vdivq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vdivq_f64( // CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2 // CHECK: ret <2 x double> [[DIV_I]] float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) { return vdivq_f64(v1, v2); } -// CHECK-LABEL: define <4 x float> @test_vdivq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vdivq_f32( // CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2 // CHECK: ret <4 x float> [[DIV_I]] float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) { return vdivq_f32(v1, v2); } -// CHECK-LABEL: define <2 x float> @test_vdiv_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vdiv_f32( // CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2 // CHECK: ret <2 x float> [[DIV_I]] float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) { return vdiv_f32(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vaba_s8( // CHECK: [[VABD_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -744,31 +726,27 @@ int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { return vaba_s8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vaba_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v2, <4 x i16> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] // CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { return vaba_s16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vaba_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v2, <2 x i32> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] // CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { return vaba_s32(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vaba_u8( // CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -776,31 +754,27 @@ uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { return vaba_u8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vaba_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v2, <4 x i16> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] // CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { return vaba_u16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vaba_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// 
CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v2, <2 x i32> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] // CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { return vaba_u32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vabaq_s8( // CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -808,31 +782,27 @@ int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { return vabaq_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vabaq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v2, <8 x i16> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { return vabaq_s16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vabaq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v2, <4 x i32> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { return vabaq_s32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vabaq_u8( // CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -840,181 +810,154 @@ uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { return vabaq_u8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vabaq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <8 x 
i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v2, <8 x i16> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { return vabaq_u16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vabaq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v2, <4 x i32> %v3) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { return vabaq_u32(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vabd_s8( // CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 // CHECK: ret <8 x i8> [[VABD_I]] int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) { return vabd_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vabd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4 // CHECK: ret <4 x i16> [[VABD2_I]] int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) { return vabd_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vabd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4 // CHECK: ret <2 x i32> [[VABD2_I]] int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) { return vabd_s32(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vabd_u8( // CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 // CHECK: ret <8 x i8> [[VABD_I]] uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) { return vabd_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vabd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// 
CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4 // CHECK: ret <4 x i16> [[VABD2_I]] uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) { return vabd_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vabd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4 // CHECK: ret <2 x i32> [[VABD2_I]] uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) { return vabd_u32(v1, v2); } -// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vabd_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %v1, <2 x float> %v2) #4 // CHECK: ret <2 x float> [[VABD2_I]] float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) { return vabd_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vabdq_s8( // CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 // CHECK: ret <16 x i8> [[VABD_I]] int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) { return vabdq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vabdq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4 // CHECK: ret <8 x i16> [[VABD2_I]] int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) { return vabdq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vabdq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4 // CHECK: ret <4 x i32> [[VABD2_I]] int32x4_t test_vabdq_s32(int32x4_t 
v1, int32x4_t v2) { return vabdq_s32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vabdq_u8( // CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 // CHECK: ret <16 x i8> [[VABD_I]] uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) { return vabdq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vabdq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4 // CHECK: ret <8 x i16> [[VABD2_I]] uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) { return vabdq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vabdq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4 // CHECK: ret <4 x i32> [[VABD2_I]] uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) { return vabdq_u32(v1, v2); } -// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vabdq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %v1, <4 x float> %v2) #4 // CHECK: ret <4 x float> [[VABD2_I]] float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) { return vabdq_f32(v1, v2); } -// CHECK-LABEL: define <2 x double> @test_vabdq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vabdq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]]) #4 +// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %v1, <2 x double> %v2) #4 // CHECK: ret <2 x double> [[VABD2_I]] float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) { return vabdq_f64(v1, v2); } - -// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbsl_s8( // CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 // 
CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 @@ -1024,16 +967,13 @@ int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) { return vbsl_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbsl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8> // CHECK: ret <8 x i8> [[TMP4]] @@ -1041,39 +981,33 @@ int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) { return vbsl_s16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vbsl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> -// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <2 x i32> [[VBSL5_I]] int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) { return vbsl_s32(v1, v2, v3); } -// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK-LABEL: @test_vbsl_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t 
test_vbsl_s64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { return vbsl_s64(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbsl_u8( // CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 // CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 @@ -1083,64 +1017,54 @@ uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { return vbsl_u8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbsl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <4 x i16> [[VBSL5_I]] uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { return vbsl_u16(v1, v2, v3); } -// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK-LABEL: @test_vbsl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> -// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <2 x i32> [[VBSL5_I]] uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { return vbsl_u32(v1, v2, v3); } -// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK-LABEL: @test_vbsl_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, 
<i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { return vbsl_u64(v1, v2, v3); } -// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK-LABEL: @test_vbsl_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <2 x i32> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> // CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP4:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[TMP0]], [[VBSL1_I]] +// CHECK: [[TMP4:%.*]] = xor <2 x i32> [[TMP0]], <i32 -1, i32 -1> // CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]] // CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float> @@ -1149,15 +1073,14 @@ float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vbsl_f32(v1, v2, v3); } -// CHECK-LABEL: define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) #0 { +// CHECK-LABEL: @test_vbsl_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> // CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, <i64 -1> // CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] // CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double> @@ -1166,7 +1089,7 @@ float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) { return vbsl_f64(v1, v2, v3); } -// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbsl_p8( // CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 // CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 @@ -1176,23 +1099,20 @@ poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) { return vbsl_p8(v1, v2, v3); } -// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbsl_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VBSL3_I:%.*]] = and <4 
x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) { return vbsl_p16(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbslq_s8( // CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 @@ -1202,55 +1122,46 @@ int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) { return vbslq_s8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbslq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <8 x i16> [[VBSL5_I]] int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) { return vbslq_s16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vbslq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> -// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) { return vbslq_s32(v1, v2, v3); } -// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK-LABEL: @test_vbslq_s64( // 
CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <2 x i64> [[VBSL5_I]] int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) { return vbslq_s64(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbslq_u8( // CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 @@ -1260,63 +1171,53 @@ uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { return vbslq_u8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbslq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <8 x i16> [[VBSL5_I]] uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { return vbslq_u16(v1, v2, v3); } -// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK-LABEL: @test_vbslq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> -// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: 
[[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_u32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) { return vbslq_s32(v1, v2, v3); } -// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK-LABEL: @test_vbslq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <2 x i64> [[VBSL5_I]] uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) { return vbslq_u64(v1, v2, v3); } -// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK-LABEL: @test_vbslq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] // CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> @@ -1325,7 +1226,7 @@ float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) { return vbslq_f32(v1, v2, v3); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK-LABEL: @test_vbslq_p8( // CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 @@ -1335,31 +1236,27 @@ poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) { return vbslq_p8(v1, v2, v3); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK-LABEL: @test_vbslq_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: 
[[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 // CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <8 x i16> [[VBSL5_I]] poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) { return vbslq_p16(v1, v2, v3); } -// CHECK-LABEL: define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK-LABEL: @test_vbslq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1> // CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] // CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> @@ -1368,267 +1265,216 @@ float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) { return vbslq_f64(v1, v2, v3); } -// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vrecps_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4 -// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %v1, <2 x float> %v2) #4 +// CHECK: ret <2 x float> [[VRECPS_V2_I]] float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) { - return vrecps_f32(v1, v2); + return vrecps_f32(v1, v2); } -// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vrecpsq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %v1, <4 x float> %v2) #4 // CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> 
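// Background for the frecps/frsqrts checks in this stretch (a sketch, not
// something this test verifies): FRECPS computes the Newton-Raphson
// reciprocal step 2.0 - a*b, and FRSQRTS the reciprocal-square-root step
// (3.0 - a*b)/2. One scalar refinement iteration, with a hypothetical
// helper name, would read:
//   static inline float frecps_step(float a, float x) { return 2.0f - a * x; }
//   /* repeating x = x * frecps_step(a, x) converges toward 1.0f / a */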
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VRECPSQ_V2_I]] float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) { - return vrecpsq_f32(v1, v2); + return vrecpsq_f32(v1, v2); } -// CHECK-LABEL: define <2 x double> @test_vrecpsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vrecpsq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %v1, <2 x double> %v2) #4 // CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x double> -// CHECK: ret <2 x double> [[TMP2]] +// CHECK: ret <2 x double> [[VRECPSQ_V2_I]] float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) { return vrecpsq_f64(v1, v2); } -// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vrsqrts_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v1, <2 x float> %v2) #4 // CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VRSQRTS_V2_I]] float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) { return vrsqrts_f32(v1, v2); } -// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vrsqrtsq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v1, <4 x float> %v2) #4 // CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]] float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) { return vrsqrtsq_f32(v1, v2); } -// CHECK-LABEL: define <2 x double> @test_vrsqrtsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vrsqrtsq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x 
double> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %v1, <2 x double> %v2) #4 // CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x double> -// CHECK: ret <2 x double> [[TMP2]] +// CHECK: ret <2 x double> [[VRSQRTSQ_V2_I]] float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) { return vrsqrtsq_f64(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcage_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) #4 // CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) { return vcage_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcage_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) #4 +// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #4 // CHECK: ret <1 x i64> [[VCAGE_V2_I]] uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) { return vcage_f64(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcageq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) #4 // CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) { return vcageq_f32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcageq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcageq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x 
double> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) #4 +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) #4 // CHECK: ret <2 x i64> [[VCAGEQ_V2_I]] uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) { return vcageq_f64(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcagt_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) #4 // CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) { return vcagt_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcagt_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) #4 +// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #4 // CHECK: ret <1 x i64> [[VCAGT_V2_I]] uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) { return vcagt_f64(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcagtq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) #4 // CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) { return vcagtq_f32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcagtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcagtq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) #4 +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) #4 // CHECK: ret <2 x i64> [[VCAGTQ_V2_I]] uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) { return 
vcagtq_f64(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcale_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) #4 // CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) { return vcale_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. } -// CHECK-LABEL: define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcale_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) #4 +// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #4 // CHECK: ret <1 x i64> [[VCALE_V2_I]] uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) { return vcale_f64(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcaleq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) #4 // CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) { return vcaleq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. } -// CHECK-LABEL: define <2 x i64> @test_vcaleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcaleq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) #4 +// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) #4 // CHECK: ret <2 x i64> [[VCALEQ_V2_I]] uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) { return vcaleq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. 
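// A one-line sketch of the identity the vcale checks above rely on (an
// assumed equivalence, stated for clarity rather than verified by FileCheck):
//   vcale_f32(v1, v2) == vcage_f32(v2, v1)  /* |v1| <= |v2| iff |v2| >= |v1| */
// which is why facge appears with its operands reversed.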
} -// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcalt_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) #4 // CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) { return vcalt_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. } -// CHECK-LABEL: define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcalt_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) #4 +// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #4 // CHECK: ret <1 x i64> [[VCALT_V2_I]] uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) { return vcalt_f64(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcaltq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) #4 // CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) { return vcaltq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. } -// CHECK-LABEL: define <2 x i64> @test_vcaltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcaltq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) #4 +// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) #4 // CHECK: ret <2 x i64> [[VCALTQ_V2_I]] uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) { return vcaltq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. 
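// The facgt lowering checked above has a simple scalar model; the helper
// below is hypothetical and only illustrates one lane (it would need
// <math.h> for fabs and is not part of this test's RUN coverage):
//   static inline uint64_t cagt_lane(double a, double b) {
//     return fabs(a) > fabs(b) ? ~0ULL : 0ULL;  /* all-ones mask when true */
//   }
//   /* vcalt(v1, v2): lane i == cagt_lane(v2[i], v1[i]), operands swapped */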
} -// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtst_s8( // CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> @@ -1637,33 +1483,29 @@ uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) { return vtst_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtst_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> // CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) { return vtst_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vtst_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> // CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) { return vtst_s32(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtst_u8( // CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> @@ -1672,33 +1514,29 @@ uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) { return vtst_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtst_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> // CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) { return vtst_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vtst_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> // CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) { return vtst_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtstq_s8( // CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> @@ -1707,33 +1545,29 @@ uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) { return vtstq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtstq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> // CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) { return vtstq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vtstq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> // CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) { return vtstq_s32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtstq_u8( // CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> @@ -1742,59 +1576,51 @@ uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) { return vtstq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtstq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast 
<16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> // CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) { return vtstq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vtstq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> // CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) { return vtstq_u32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vtstq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vtstq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> // CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) { return vtstq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vtstq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vtstq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> // CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) { return vtstq_u64(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtst_p8( // CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> @@ -1803,20 +1629,18 @@ uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) { return vtst_p8(v1, v2); } -// CHECK-LABEL: define <4 x 
i16> @test_vtst_p16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtst_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> // CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) { return vtst_p16(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vtstq_p8( // CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 // CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> @@ -1825,46 +1649,40 @@ uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) { return vtstq_p8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vtstq_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 +// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> // CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) { return vtstq_p16(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vtst_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b +// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> // CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) { return vtst_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vtst_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] -// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b 
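// Every vtst variant in this block reduces to the same three-step pattern:
// AND the inputs, compare the result against zero, then sign-extend the i1
// predicate so a true lane becomes all-ones. A scalar sketch of one lane
// (hypothetical helper, for illustration only):
//   static inline uint64_t vtst_lane(uint64_t a, uint64_t b) {
//     return (a & b) != 0 ? ~0ULL : 0ULL;
//   }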
+// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> // CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) { return vtst_u64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceq_s8( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -1872,7 +1690,7 @@ uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) { return vceq_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vceq_s16( // CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -1880,7 +1698,7 @@ uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) { return vceq_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vceq_s32( // CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -1888,7 +1706,7 @@ uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) { return vceq_s32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vceq_s64( // CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -1896,7 +1714,7 @@ uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) { return vceq_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vceq_u64( // CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -1904,7 +1722,7 @@ uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) { return vceq_u64(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vceq_f32( // CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -1912,7 +1730,7 @@ uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { return vceq_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vceq_f64( // CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -1920,7 +1738,7 @@ uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { return vceq_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceq_u8( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -1928,7 +1746,7 @@ uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) { return vceq_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vceq_u16( // CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -1936,7 +1754,7 @@ uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t 
v2) { return vceq_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vceq_u32( // CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -1944,7 +1762,7 @@ uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) { return vceq_u32(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceq_p8( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -1952,7 +1770,7 @@ uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) { return vceq_p8(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceqq_s8( // CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -1960,7 +1778,7 @@ uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) { return vceqq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vceqq_s16( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -1968,7 +1786,7 @@ uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) { return vceqq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vceqq_s32( // CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -1976,7 +1794,7 @@ uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) { return vceqq_s32(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vceqq_f32( // CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -1984,7 +1802,7 @@ uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { return vceqq_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceqq_u8( // CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -1992,7 +1810,7 @@ uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) { return vceqq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vceqq_u16( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2000,7 +1818,7 @@ uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) { return vceqq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vceqq_u32( // CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2008,7 +1826,7 @@ uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) { return vceqq_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vceqq_p8( // CHECK: 
[[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2016,8 +1834,7 @@ uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) { return vceqq_p8(v1, v2); } - -// CHECK-LABEL: define <2 x i64> @test_vceqq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vceqq_s64( // CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2025,7 +1842,7 @@ uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) { return vceqq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vceqq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vceqq_u64( // CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2033,14 +1850,15 @@ uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) { return vceqq_u64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vceqq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vceqq_f64( // CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) { return vceqq_f64(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { + +// CHECK-LABEL: @test_vcge_s8( // CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2048,7 +1866,7 @@ uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) { return vcge_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcge_s16( // CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2056,7 +1874,7 @@ uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) { return vcge_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcge_s32( // CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2064,7 +1882,7 @@ uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) { return vcge_s32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcge_s64( // CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2072,7 +1890,7 @@ uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) { return vcge_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcge_u64( // CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2080,7 +1898,7 @@ uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) { return vcge_u64(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcge_f32( // CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2088,7 +1906,7 @@ uint32x2_t 
test_vcge_f32(float32x2_t v1, float32x2_t v2) { return vcge_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcge_f64( // CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2096,7 +1914,7 @@ uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { return vcge_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcge_u8( // CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2104,7 +1922,7 @@ uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) { return vcge_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcge_u16( // CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2112,7 +1930,7 @@ uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) { return vcge_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcge_u32( // CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2120,7 +1938,7 @@ uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) { return vcge_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_s8( // CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2128,7 +1946,7 @@ uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) { return vcgeq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_s16( // CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2136,7 +1954,7 @@ uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) { return vcgeq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_s32( // CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2144,7 +1962,7 @@ uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) { return vcgeq_s32(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_f32( // CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2152,7 +1970,7 @@ uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) { return vcgeq_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_u8( // CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2160,7 +1978,7 @@ uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) { return vcgeq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// 
CHECK-LABEL: @test_vcgeq_u16( // CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2168,7 +1986,7 @@ uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) { return vcgeq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_u32( // CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2176,7 +1994,7 @@ uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) { return vcgeq_u32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgeq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_s64( // CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2184,7 +2002,7 @@ uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) { return vcgeq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgeq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_u64( // CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2192,7 +2010,7 @@ uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) { return vcgeq_u64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgeq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcgeq_f64( // CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2200,18 +2018,18 @@ uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) { return vcgeq_f64(v1, v2); } -// Notes about vcle: -// LE condition predicate implemented as GE, so check reversed operands. -// Using registers other than v0, v1 is possible, but would be odd. -// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcle_s8( // CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] +// Notes about vcle: +// LE condition predicate implemented as GE, so check reversed operands. +// Using registers other than v0, v1 is possible, but would be odd. 
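// Concretely (an illustrative equivalence, not separately checked): AArch64
// has no register-register CMLE, so codegen selects CMGE with the operand
// order reversed, using v1 <= v2 iff v2 >= v1:
//   vcle_s8(v1, v2) == vcge_s8(v2, v1)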
uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) { return vcle_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcle_s16( // CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2219,7 +2037,7 @@ uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) { return vcle_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcle_s32( // CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2227,7 +2045,7 @@ uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) { return vcle_s32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcle_s64( // CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2235,7 +2053,7 @@ uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) { return vcle_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcle_u64( // CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2243,7 +2061,7 @@ uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) { return vcle_u64(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcle_f32( // CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2251,7 +2069,7 @@ uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) { return vcle_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcle_f64( // CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2259,7 +2077,7 @@ uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) { return vcle_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcle_u8( // CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2267,7 +2085,7 @@ uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) { return vcle_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcle_u16( // CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2275,7 +2093,7 @@ uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) { return vcle_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcle_u32( // CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2283,7 +2101,7 @@ uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) { return vcle_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcleq_s8( 
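// The vceq/vcge/vcle/vcgt tests all check one shared idiom: a vector
// icmp/fcmp producing <N x i1>, then a sext to the lane type so true lanes
// become all-ones. A scalar sketch of one lane (hypothetical helper):
//   static inline int8_t cle_lane(int8_t a, int8_t b) {
//     return a <= b ? (int8_t)-1 : 0;  /* (int8_t)-1 == 0xff, all-ones */
//   }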
// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2291,7 +2109,7 @@ uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) { return vcleq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcleq_s16( // CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2299,7 +2117,7 @@ uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) { return vcleq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcleq_s32( // CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2307,7 +2125,7 @@ uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) { return vcleq_s32(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcleq_f32( // CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2315,7 +2133,7 @@ uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) { return vcleq_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcleq_u8( // CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2323,7 +2141,7 @@ uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) { return vcleq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcleq_u16( // CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2331,7 +2149,7 @@ uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) { return vcleq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcleq_u32( // CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2339,7 +2157,7 @@ uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) { return vcleq_u32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcleq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcleq_s64( // CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2347,7 +2165,7 @@ uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) { return vcleq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcleq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcleq_u64( // CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2355,7 +2173,7 @@ uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) { return vcleq_u64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcleq_f64( // CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: 
ret <2 x i64> [[SEXT_I]] @@ -2363,8 +2181,7 @@ uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) { return vcleq_f64(v1, v2); } - -// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgt_s8( // CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2372,7 +2189,7 @@ uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) { return vcgt_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcgt_s16( // CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2380,7 +2197,7 @@ uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) { return vcgt_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcgt_s32( // CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2388,7 +2205,7 @@ uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) { return vcgt_s32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcgt_s64( // CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2396,7 +2213,7 @@ uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) { return vcgt_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vcgt_u64( // CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2404,7 +2221,7 @@ uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) { return vcgt_u64(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vcgt_f32( // CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2412,7 +2229,7 @@ uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) { return vcgt_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vcgt_f64( // CHECK: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2420,7 +2237,7 @@ uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) { return vcgt_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgt_u8( // CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2428,7 +2245,7 @@ uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) { return vcgt_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcgt_u16( // CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2436,7 +2253,7 @@ uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) { return vcgt_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %v1, <2 x 
i32> %v2) #0 { +// CHECK-LABEL: @test_vcgt_u32( // CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2444,7 +2261,7 @@ uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) { return vcgt_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_s8( // CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2452,7 +2269,7 @@ uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) { return vcgtq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_s16( // CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2460,7 +2277,7 @@ uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) { return vcgtq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_s32( // CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2468,7 +2285,7 @@ uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) { return vcgtq_s32(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_f32( // CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2476,7 +2293,7 @@ uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) { return vcgtq_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_u8( // CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2484,7 +2301,7 @@ uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) { return vcgtq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_u16( // CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2492,7 +2309,7 @@ uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) { return vcgtq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_u32( // CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2500,7 +2317,7 @@ uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) { return vcgtq_u32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgtq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_s64( // CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2508,7 +2325,7 @@ uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) { return vcgtq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgtq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_u64( // CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> 
[[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2516,7 +2333,7 @@ uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) { return vcgtq_u64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcgtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcgtq_f64( // CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2524,20 +2341,18 @@ uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) { return vcgtq_f64(v1, v2); } - -// Notes about vclt: -// LT condition predicate implemented as GT, so check reversed operands. -// Using registers other than v0, v1 are possible, but would be odd. - -// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vclt_s8( // CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] +// Notes about vclt: +// LT condition predicate implemented as GT, so check reversed operands. +// Using registers other than v0, v1 are possible, but would be odd. uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) { return vclt_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vclt_s16( // CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2545,7 +2360,7 @@ uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) { return vclt_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vclt_s32( // CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2553,7 +2368,7 @@ uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) { return vclt_s32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vclt_s64( // CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2561,7 +2376,7 @@ uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) { return vclt_s64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vclt_u64( // CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2569,7 +2384,7 @@ uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) { return vclt_u64(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK-LABEL: @test_vclt_f32( // CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2577,7 +2392,7 @@ uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) { return vclt_f32(v1, v2); } -// CHECK-LABEL: define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vclt_f64( // CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> // CHECK: ret <1 x i64> [[SEXT_I]] @@ -2585,7 +2400,7 @@ uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) { return vclt_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %v1, <8 x i8> %v2) 
#0 { +// CHECK-LABEL: @test_vclt_u8( // CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -2593,7 +2408,7 @@ uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) { return vclt_u8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vclt_u16( // CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -2601,7 +2416,7 @@ uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) { return vclt_u16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vclt_u32( // CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> // CHECK: ret <2 x i32> [[SEXT_I]] @@ -2609,7 +2424,7 @@ uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) { return vclt_u32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcltq_s8( // CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2617,7 +2432,7 @@ uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) { return vcltq_s8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcltq_s16( // CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2625,7 +2440,7 @@ uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) { return vcltq_s16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcltq_s32( // CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2633,7 +2448,7 @@ uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) { return vcltq_s32(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK-LABEL: @test_vcltq_f32( // CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK: ret <4 x i32> [[SEXT_I]] @@ -2641,7 +2456,7 @@ uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) { return vcltq_f32(v1, v2); } -// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK-LABEL: @test_vcltq_u8( // CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK: ret <16 x i8> [[SEXT_I]] @@ -2649,7 +2464,7 @@ uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) { return vcltq_u8(v1, v2); } -// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK-LABEL: @test_vcltq_u16( // CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK: ret <8 x i16> [[SEXT_I]] @@ -2657,7 +2472,7 @@ uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) { return vcltq_u16(v1, v2); } -// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK-LABEL: @test_vcltq_u32( // CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> 
// CHECK: ret <4 x i32> [[SEXT_I]] @@ -2665,7 +2480,7 @@ uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) { return vcltq_u32(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcltq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcltq_s64( // CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2673,7 +2488,7 @@ uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) { return vcltq_s64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcltq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK-LABEL: @test_vcltq_u64( // CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2681,7 +2496,7 @@ uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) { return vcltq_u64(v1, v2); } -// CHECK-LABEL: define <2 x i64> @test_vcltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK-LABEL: @test_vcltq_f64( // CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2 // CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK: ret <2 x i64> [[SEXT_I]] @@ -2689,1513 +2504,1219 @@ uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) { return vcltq_f64(v1, v2); } - -// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK-LABEL: @test_vhadd_s8( // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 // CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) { return vhadd_s8(v1, v2); } -// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK-LABEL: @test_vhadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VHADD_V2_I]] int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) { return vhadd_s16(v1, v2); } -// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK-LABEL: @test_vhadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VHADD_V2_I]] int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) { return vhadd_s32(v1, v2); } -// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> 
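All the comparison tests above share one shape: the NEON compare intrinsics return an unsigned mask vector whose lanes are all-ones where the predicate holds and all-zeros where it does not, which is why the expected IR is an icmp/fcmp followed by a sext of the i1 lanes. A minimal scalar sketch of that per-lane semantics (illustrative only, not part of the patch; the _lane helper name is hypothetical):

    #include <stdint.h>

    /* One lane of vcle_s8: compare, then "sign-extend" the 1-bit result
       into an all-ones (0xFF) or all-zeros (0x00) mask, matching the
       icmp sle + sext pattern in the CHECK lines. */
    static uint8_t vcle_s8_lane(int8_t a, int8_t b) {
      return (uint8_t)(a <= b ? 0xFF : 0x00);
    }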
-
-// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_s8(
 // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
   return vhadd_s8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VHADD_V2_I]]
 int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
   return vhadd_s16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VHADD_V2_I]]
 int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
   return vhadd_s32(v1, v2);
 }

-// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_u8(
 // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
   return vhadd_u8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VHADD_V2_I]]
 uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
   return vhadd_u16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhadd_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VHADD_V2_I]]
 uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
   return vhadd_u32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_s8(
 // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
   return vhaddq_s8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
 int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
   return vhaddq_s16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
 int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
   return vhaddq_s32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_u8(
 // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vhaddq_u8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
 uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vhaddq_u16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhaddq_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
 uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vhaddq_u32(v1, v2);
 }

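The vhadd hunks above now expect a direct call to llvm.aarch64.neon.{s,u}hadd with the round-trip bitcasts dropped from the CHECK lines. Semantically, a halving add computes (a + b) >> 1 per lane with the sum formed at wider precision, so it cannot wrap. A scalar reference for one signed lane (illustrative only, not part of the patch; the _lane helper name is hypothetical):

    #include <stdint.h>

    /* One lane of vhadd_s16: widen, add, then halve with an arithmetic
       shift; the 32-bit intermediate sum cannot overflow. */
    static int16_t vhadd_s16_lane(int16_t a, int16_t b) {
      return (int16_t)(((int32_t)a + (int32_t)b) >> 1);
    }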
-
-// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_s8(
 // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
   return vhsub_s8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
 int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
   return vhsub_s16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
 int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
   return vhsub_s32(v1, v2);
 }

-// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_u8(
 // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
   return vhsub_u8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
 uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
   return vhsub_u16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhsub_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
 uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
   return vhsub_u32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_s8(
 // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
   return vhsubq_s8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
 int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
   return vhsubq_s16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
 int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
   return vhsubq_s32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_u8(
 // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vhsubq_u8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
 uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vhsubq_u16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vhsubq_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
 uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vhsubq_u32(v1, v2);
 }

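The vhsub hunks have the same shape with llvm.aarch64.neon.{s,u}hsub: a halving subtract, i.e. (a - b) >> 1 with the difference formed at wider precision. One signed lane as a scalar sketch (illustrative only, not part of the patch; the _lane helper name is hypothetical):

    #include <stdint.h>

    /* One lane of vhsub_s16: widen, subtract, halve arithmetically. */
    static int16_t vhsub_s16_lane(int16_t a, int16_t b) {
      return (int16_t)(((int32_t)a - (int32_t)b) >> 1);
    }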
-
-// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_s8(
 // CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
   return vrhadd_s8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
 int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
   return vrhadd_s16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
 int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
   return vrhadd_s32(v1, v2);
 }

-// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_u8(
 // CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
 // CHECK: ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
   return vrhadd_u8(v1, v2);
 }

-// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
 // CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
 uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
   return vrhadd_u16(v1, v2);
 }

-// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vrhadd_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
 // CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
 uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
   return vrhadd_u32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_s8(
 // CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
   return vrhaddq_s8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
 int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
   return vrhaddq_s16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
 int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
   return vrhaddq_s32(v1, v2);
 }

-// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_u8(
 // CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
 // CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vrhaddq_u8(v1, v2);
 }

-// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
 // CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vrhaddq_u16(v1, v2);
 }

-// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK-LABEL: @test_vrhaddq_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
 // CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vrhaddq_u32(v1, v2);
 }

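The vrhadd hunks target llvm.aarch64.neon.{s,u}rhadd, the rounding variant of the halving add: the lane result is (a + b + 1) >> 1, so ties round up rather than toward negative infinity. Scalar sketch of one signed lane (illustrative only, not part of the patch; the _lane helper name is hypothetical):

    #include <stdint.h>

    /* One lane of vrhadd_s16: widen, add, add the rounding bias,
       then halve arithmetically. */
    static int16_t vrhadd_s16_lane(int16_t a, int16_t b) {
      return (int16_t)(((int32_t)a + (int32_t)b + 1) >> 1);
    }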
-// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vqadd_s8(
 // CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
 // CHECK: ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }

-// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqadd_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQADD_V2_I]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }

-// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqadd_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQADD_V2_I]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }

-// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqadd_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQADD_V2_I]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }

-// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqadd_u8(
 // CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
 // CHECK: ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }

-// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqadd_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQADD_V2_I]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }

-// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqadd_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQADD_V2_I]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }

-// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqadd_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQADD_V2_I]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }

-// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s8(
 // CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
 // CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }

-// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }

-// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }

-// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }

-// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u8(
 // CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
 // CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }

-// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }

-// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }

-// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }

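The vqadd hunks check llvm.aarch64.neon.{s,u}qadd, the saturating add: instead of wrapping on overflow, the lane result is clamped to the element type's range. A scalar model of one signed lane (illustrative only, not part of the patch; the _lane helper name is hypothetical):

    #include <stdint.h>

    /* One lane of vqadd_s8: add at wider precision, then clamp to
       [INT8_MIN, INT8_MAX] instead of wrapping. */
    static int8_t vqadd_s8_lane(int8_t a, int8_t b) {
      int16_t s = (int16_t)a + (int16_t)b;
      if (s > INT8_MAX) return INT8_MAX;
      if (s < INT8_MIN) return INT8_MIN;
      return (int8_t)s;
    }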
test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vqsub_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VQSUB_V2_I]] int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vqsub_u8( // CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VQSUB_V_I]] uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vqsub_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQSUB_V2_I]] uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vqsub_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQSUB_V2_I]] uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vqsub_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VQSUB_V2_I:%.*]] = 
call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VQSUB_V2_I]] uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqsubq_s8( // CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQSUBQ_V_I]] int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqsubq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqsubq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqsubq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret 
<2 x i64> [[VQSUBQ_V2_I]] int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqsubq_u8( // CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQSUBQ_V_I]] uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqsubq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqsubq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqsubq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vshl_s8( // CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VSHL_V_I]] int8x8_t test_vshl_s8(int8x8_t a, 
int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vshl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VSHL_V2_I]] int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vshl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VSHL_V2_I]] int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vshl_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VSHL_V2_I]] int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vshl_u8( // CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VSHL_V_I]] uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vshl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 
+// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VSHL_V2_I]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VSHL_V2_I]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VSHL_V2_I]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vshlq_s8(
// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vshlq_u8(
// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
  return vqshl_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
  return vqshl_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
  return vqshl_s32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
  return vqshl_s64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
  return vqshl_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
  return vqshl_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
  return vqshl_u32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
  return vqshl_u64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
  return vqshlq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
  return vqshlq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
  return vqshlq_s32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
  return vqshlq_s64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqshlq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqshlq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqshlq_u32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqshlq_u64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
  return vrshl_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
  return vrshl_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
  return vrshl_s32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
  return vrshl_s64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
  return vrshl_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
  return vrshl_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
  return vrshl_u32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
  return vrshl_u64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
  return vrshlq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
  return vrshlq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
  return vrshlq_s32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
  return vrshlq_s64(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vrshlq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vrshlq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vrshlq_u32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vrshlq_u64(a, b);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
  return vqrshl_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
  return vqrshl_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
  return vqrshl_s32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
  return vqrshl_s64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
  return vqrshl_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
  return vqrshl_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
  return vqrshl_u32(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VQRSHL_V2_I]] uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s8( // CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = 
bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u8( // CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsli_n_p64( // CHECK: 
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -4203,10 +3724,10 @@ uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
// CHECK: ret <1 x i64> [[VSLI_N2]]
poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
-  return vsli_n_p64(a, b, 0);
+  return vsli_n_p64(a, b, 0);
}
-// CHECK-LABEL: define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_p64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -4214,1054 +3735,878 @@ poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
// CHECK: ret <2 x i64> [[VSLI_N2]]
poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
-  return vsliq_n_p64(a, b, 0);
+  return vsliq_n_p64(a, b, 0);
}
-// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmax_s8(
// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMAX_I]]
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
  return vmax_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmax_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VMAX2_I]]
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
  return vmax_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmax_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VMAX2_I]]
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
  return vmax_s32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmax_u8(
// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMAX_I]]
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
  return vmax_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmax_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VMAX2_I]]
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
  return vmax_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmax_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VMAX2_I]]
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
  return vmax_u32(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vmax_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VMAX2_I]]
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
  return vmax_f32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_s8(
// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VMAX_I]]
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
  return vmaxq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VMAX2_I]]
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
  return vmaxq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VMAX2_I]]
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
  return vmaxq_s32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_u8(
// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VMAX_I]]
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
  return vmaxq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VMAX2_I]]
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
  return vmaxq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VMAX2_I]]
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
  return vmaxq_u32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VMAX2_I]]
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
  return vmaxq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vmaxq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VMAX2_I]]
float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
  return vmaxq_f64(a, b);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmin_s8(
// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMIN_I]]
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
  return vmin_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmin_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VMIN2_I]]
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
  return vmin_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmin_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VMIN2_I]]
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
  return vmin_s32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmin_u8(
// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMIN_I]]
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
  return vmin_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmin_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VMIN2_I]]
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
  return vmin_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmin_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VMIN2_I]]
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
  return vmin_u32(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vmin_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VMIN2_I]]
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
  return vmin_f32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vminq_s8(
// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VMIN_I]]
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
  return vminq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vminq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VMIN2_I]]
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
  return vminq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vminq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VMIN2_I]]
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
  return vminq_s32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vminq_u8(
// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VMIN_I]]
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
  return vminq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vminq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VMIN2_I]]
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
  return vminq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vminq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VMIN2_I]]
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
  return vminq_u32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vminq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VMIN2_I]]
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
  return vminq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vminq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vminq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VMIN2_I]]
float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
  return vminq_f64(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vmaxnm_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]]) #4
+// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VMAXNM2_I]]
float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
  return vmaxnm_f32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vmaxnmq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]]) #4
+// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VMAXNM2_I]]
float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
  return vmaxnmq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vmaxnmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]]) #4
+// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VMAXNM2_I]]
float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
  return vmaxnmq_f64(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vminnm_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]]) #4
+// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VMINNM2_I]]
float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
  return vminnm_f32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vminnmq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]]) #4
+// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VMINNM2_I]]
float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
  return vminnmq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vminnmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]]) #4
+// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VMINNM2_I]]
float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
  return vminnmq_f64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmax_s8(
// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPMAX_I]]
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
  return vpmax_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmax_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VPMAX2_I]]
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
  return vpmax_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmax_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VPMAX2_I]]
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
  return vpmax_s32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmax_u8(
// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPMAX_I]]
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
  return vpmax_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmax_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VPMAX2_I]]
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
  return vpmax_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmax_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VPMAX2_I]]
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
  return vpmax_u32(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vpmax_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VPMAX2_I]]
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
  return vpmax_f32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_s8(
// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPMAX_I]]
int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
  return vpmaxq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VPMAX2_I]]
int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
  return vpmaxq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VPMAX2_I]]
int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
  return vpmaxq_s32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_u8(
// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPMAX_I]]
uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
  return vpmaxq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VPMAX2_I]]
uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
  return vpmaxq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VPMAX2_I]]
uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
  return vpmaxq_u32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vpmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VPMAX2_I]]
float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
  return vpmaxq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vpmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vpmaxq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]]) #4
+// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VPMAX2_I]]
float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
  return vpmaxq_f64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmin_s8(
// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPMIN_I]]
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
  return vpmin_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmin_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VPMIN2_I]]
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
  return vpmin_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmin_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VPMIN2_I]]
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
  return vpmin_s32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpmin_u8(
// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPMIN_I]]
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
  return vpmin_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpmin_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VPMIN2_I]]
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
  return vpmin_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpmin_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VPMIN2_I]]
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
  return vpmin_u32(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vpmin_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VPMIN2_I]]
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
  return vpmin_f32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpminq_s8(
// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPMIN_I]]
int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
  return vpminq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpminq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VPMIN2_I]]
int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
  return vpminq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpminq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VPMIN2_I]]
int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
  return vpminq_s32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpminq_u8(
// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPMIN_I]]
uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
  return vpminq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpminq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VPMIN2_I]]
uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
  return vpminq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpminq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VPMIN2_I]]
uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
  return vpminq_u32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vpminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vpminq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VPMIN2_I]]
float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
  return vpminq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vpminq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vpminq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]]) #4
+// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VPMIN2_I]]
float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
  return vpminq_f64(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vpmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vpmaxnm_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]]) #4
+// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VPMAXNM2_I]]
float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
  return vpmaxnm_f32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vpmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vpmaxnmq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]]) #4
+// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VPMAXNM2_I]]
float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
  return vpmaxnmq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vpmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vpmaxnmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]]) #4
+// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VPMAXNM2_I]]
float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
  return vpmaxnmq_f64(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vpminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vpminnm_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]]) #4
+// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VPMINNM2_I]]
float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
  return vpminnm_f32(a, b);
}
-// CHECK-LABEL: define <4 x float> @test_vpminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vpminnmq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]]) #4
+// CHECK: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VPMINNM2_I]]
float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
  return vpminnmq_f32(a, b);
}
-// CHECK-LABEL: define <2 x double> @test_vpminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vpminnmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]]) #4
+// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VPMINNM2_I]]
float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
  return vpminnmq_f64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadd_s8(
// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPADD_V_I]]
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VPADD_V2_I]]
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VPADD_V2_I]]
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadd_u8(
// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VPADD_V_I]]
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VPADD_V2_I]]
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VPADD_V2_I]]
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}
-// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vpadd_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
+// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP2]]
+// CHECK: ret <2 x float> [[VPADD_V2_I]]
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_s8(
// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPADDQ_V_I]]
int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
  return vpaddq_s8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
+// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VPADDQ_V2_I]]
int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
  return vpaddq_s16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4
+// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VPADDQ_V2_I]]
int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
  return vpaddq_s32(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vpaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_u8(
// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VPADDQ_V_I]]
uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vpaddq_u8(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
+// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VPADDQ_V2_I]]
uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vpaddq_u16(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
[[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VPADDQ_V2_I]] uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) { return vpaddq_u32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vpaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vpaddq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VPADDQ_V2_I]] float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { return vpaddq_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vpaddq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK-LABEL: @test_vpaddq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %a, <2 x double> %b) #4 // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x double> -// CHECK: ret <2 x double> [[TMP2]] +// CHECK: ret <2 x double> [[VPADDQ_V2_I]] float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { return vpaddq_f64(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vqdmulh_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4 +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x 
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_s32(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqdmulhq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqdmulhq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqrdmulh_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqrdmulh_s32(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqrdmulhq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

-// CHECK-LABEL: define <2 x float> @test_vmulx_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vmulx_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #4
+// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: ret <2 x float> [[VMULX2_I]]
float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
  return vmulx_f32(a, b);
}

-// CHECK-LABEL: define <4 x float> @test_vmulxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vmulxq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #4
+// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: ret <4 x float> [[VMULX2_I]]
float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
  return vmulxq_f32(a, b);
}

-// CHECK-LABEL: define <2 x double> @test_vmulxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vmulxq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #4
+// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %b) #4
// CHECK: ret <2 x double> [[VMULX2_I]]
float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
  return vmulxq_f64(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_s8(
// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <8 x i8> [[VSHL_N]]
int8x8_t test_vshl_n_s8(int8x8_t a) {
  return vshl_n_s8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
@@ -5270,7 +4615,7 @@ int16x4_t test_vshl_n_s16(int16x4_t a) {
  return vshl_n_s16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
@@ -5279,14 +4624,14 @@ int32x2_t test_vshl_n_s32(int32x2_t a) {
  return vshl_n_s32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_s8(
// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <16 x i8> [[VSHL_N]]
int8x16_t test_vshlq_n_s8(int8x16_t a) {
  return vshlq_n_s8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -5295,7 +4640,7 @@ int16x8_t test_vshlq_n_s16(int16x8_t a) {
  return vshlq_n_s16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
@@ -5304,7 +4649,7 @@ int32x4_t test_vshlq_n_s32(int32x4_t a) {
  return vshlq_n_s32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
@@ -5313,14 +4658,14 @@ int64x2_t test_vshlq_n_s64(int64x2_t a) {
  return vshlq_n_s64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_u8(
// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <8 x i8> [[VSHL_N]]
int8x8_t test_vshl_n_u8(int8x8_t a) {
  return vshl_n_u8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
@@ -5329,7 +4674,7 @@ int16x4_t test_vshl_n_u16(int16x4_t a) {
  return vshl_n_u16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshl_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
@@ -5338,14 +4683,14 @@ int32x2_t test_vshl_n_u32(int32x2_t a) {
  return vshl_n_u32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_u8(
// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <16 x i8> [[VSHL_N]]
int8x16_t test_vshlq_n_u8(int8x16_t a) {
  return vshlq_n_u8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -5354,7 +4699,7 @@ int16x8_t test_vshlq_n_u16(int16x8_t a) {
  return vshlq_n_u16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
@@ -5363,7 +4708,7 @@ int32x4_t test_vshlq_n_u32(int32x4_t a) {
  return vshlq_n_u32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshlq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
@@ -5372,14 +4717,14 @@ int64x2_t test_vshlq_n_u64(int64x2_t a) {
  return vshlq_n_u64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_s8(
// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <8 x i8> [[VSHR_N]]
int8x8_t test_vshr_n_s8(int8x8_t a) {
  return vshr_n_s8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
@@ -5388,7 +4733,7 @@ int16x4_t test_vshr_n_s16(int16x4_t a) {
  return vshr_n_s16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3>
@@ -5397,14 +4742,14 @@ int32x2_t test_vshr_n_s32(int32x2_t a) {
  return vshr_n_s32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s8(
// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <16 x i8> [[VSHR_N]]
int8x16_t test_vshrq_n_s8(int8x16_t a) {
  return vshrq_n_s8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -5413,7 +4758,7 @@ int16x8_t test_vshrq_n_s16(int16x8_t a) {
  return vshrq_n_s16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
@@ -5422,7 +4767,7 @@ int32x4_t test_vshrq_n_s32(int32x4_t a) {
  return vshrq_n_s32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3>
@@ -5431,14 +4776,14 @@ int64x2_t test_vshrq_n_s64(int64x2_t a) {
  return vshrq_n_s64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_u8(
// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <8 x i8> [[VSHR_N]]
int8x8_t test_vshr_n_u8(int8x8_t a) {
  return vshr_n_u8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
@@ -5447,7 +4792,7 @@ int16x4_t test_vshr_n_u16(int16x4_t a) {
  return vshr_n_u16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3>
@@ -5456,14 +4801,14 @@ int32x2_t test_vshr_n_u32(int32x2_t a) {
  return vshr_n_u32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u8(
// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: ret <16 x i8> [[VSHR_N]]
int8x16_t test_vshrq_n_u8(int8x16_t a) {
  return vshrq_n_u8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -5472,7 +4817,7 @@ int16x8_t test_vshrq_n_u16(int16x8_t a) {
  return vshrq_n_u16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
@@ -5481,7 +4826,7 @@ int32x4_t test_vshrq_n_u32(int32x4_t a) {
  return vshrq_n_u32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3>
@@ -5490,7 +4835,7 @@ int64x2_t test_vshrq_n_u64(int64x2_t a) {
  return vshrq_n_u64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s8(
// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK: ret <8 x i8> [[TMP0]]
@@ -5498,7 +4843,7 @@ int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
  return vsra_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -5510,7 +4855,7 @@ int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
  return vsra_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -5522,7 +4867,7 @@ int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
  return vsra_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s8(
// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK: ret <16 x i8> [[TMP0]]
@@ -5530,7 +4875,7 @@ int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vsraq_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -5542,7 +4887,7 @@ int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vsraq_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -5554,7 +4899,7 @@ int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vsraq_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -5566,7 +4911,7 @@ int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vsraq_n_s64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u8(
// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK: ret <8 x i8> [[TMP0]]
@@ -5574,7 +4919,7 @@ int8x8_t test_vsra_n_u8(int8x8_t a, int8x8_t b) {
  return vsra_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -5586,7 +4931,7 @@ int16x4_t test_vsra_n_u16(int16x4_t a, int16x4_t b) {
  return vsra_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -5598,7 +4943,7 @@ int32x2_t test_vsra_n_u32(int32x2_t a, int32x2_t b) {
  return vsra_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u8(
// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK: ret <16 x i8> [[TMP0]]
@@ -5606,7 +4951,7 @@ int8x16_t test_vsraq_n_u8(int8x16_t a, int8x16_t b) {
  return vsraq_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -5618,7 +4963,7 @@ int16x8_t test_vsraq_n_u16(int16x8_t a, int16x8_t b) {
  return vsraq_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -5630,7 +4975,7 @@ int32x4_t test_vsraq_n_u32(int32x4_t a, int32x4_t b) {
  return vsraq_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -5642,14 +4987,14 @@ int64x2_t test_vsraq_n_u64(int64x2_t a, int64x2_t b) {
  return vsraq_n_u64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
int8x8_t test_vrshr_n_s8(int8x8_t a) {
  return vrshr_n_s8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
@@ -5658,7 +5003,7 @@ int16x4_t test_vrshr_n_s16(int16x4_t a) {
  return vrshr_n_s16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
@@ -5667,14 +5012,14 @@ int32x2_t test_vrshr_n_s32(int32x2_t a) {
  return vrshr_n_s32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: ret <16 x i8> [[VRSHR_N]]
int8x16_t test_vrshrq_n_s8(int8x16_t a) {
  return vrshrq_n_s8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
@@ -5683,7 +5028,7 @@ int16x8_t test_vrshrq_n_s16(int16x8_t a) {
  return vrshrq_n_s16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
@@ -5692,7 +5037,7 @@ int32x4_t test_vrshrq_n_s32(int32x4_t a) {
  return vrshrq_n_s32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
@@ -5701,14 +5046,14 @@ int64x2_t test_vrshrq_n_s64(int64x2_t a) {
  return vrshrq_n_s64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
int8x8_t test_vrshr_n_u8(int8x8_t a) {
  return vrshr_n_u8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
@@ -5717,7 +5062,7 @@ int16x4_t test_vrshr_n_u16(int16x4_t a) {
  return vrshr_n_u16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
@@ -5726,14 +5071,14 @@ int32x2_t test_vrshr_n_u32(int32x2_t a) {
  return vrshr_n_u32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: ret <16 x i8> [[VRSHR_N]]
int8x16_t test_vrshrq_n_u8(int8x16_t a) {
  return vrshrq_n_u8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
@@ -5742,7 +5087,7 @@ int16x8_t test_vrshrq_n_u16(int16x8_t a) {
  return vrshrq_n_u16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
@@ -5751,7 +5096,7 @@ int32x4_t test_vrshrq_n_u32(int32x4_t a) {
  return vrshrq_n_u32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshrq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
@@ -5760,7 +5105,7 @@ int64x2_t test_vrshrq_n_u64(int64x2_t a) {
  return vrshrq_n_u64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
// CHECK: ret <8 x i8> [[TMP0]]
@@ -5768,7 +5113,7 @@ int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -5780,7 +5125,7 @@ int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -5792,7 +5137,7 @@ int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
// CHECK: ret <16 x i8> [[TMP0]]
@@ -5800,7 +5145,7 @@ int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -5812,7 +5157,7 @@ int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -5824,7 +5169,7 @@ int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -5836,7 +5181,7 @@ int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
// CHECK: ret <8 x i8> [[TMP0]]
@@ -5844,7 +5189,7 @@ int8x8_t test_vrsra_n_u8(int8x8_t a, int8x8_t b) {
  return vrsra_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -5856,7 +5201,7 @@ int16x4_t test_vrsra_n_u16(int16x4_t a, int16x4_t b) {
  return vrsra_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -5868,7 +5213,7 @@ int32x2_t test_vrsra_n_u32(int32x2_t a, int32x2_t b) {
  return vrsra_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
// CHECK: ret <16 x i8> [[TMP0]]
@@ -5876,7 +5221,7 @@ int8x16_t test_vrsraq_n_u8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -5888,7 +5233,7 @@ int16x8_t test_vrsraq_n_u16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -5900,7 +5245,7 @@ int32x4_t test_vrsraq_n_u32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrsraq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -5912,14 +5257,14 @@ int64x2_t test_vrsraq_n_u64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_u64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s8(
// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSRI_N]]
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
  return vsri_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -5930,7 +5275,7 @@ int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
  return vsri_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -5941,14 +5286,14 @@ int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
  return vsri_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s8(
// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSRI_N]]
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
  return vsriq_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -5959,7 +5304,7 @@ int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
  return vsriq_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -5970,7 +5315,7 @@ int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
  return vsriq_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -5981,14 +5326,14 @@ int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
  return vsriq_n_s64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u8(
// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSRI_N]]
int8x8_t test_vsri_n_u8(int8x8_t a, int8x8_t b) {
  return vsri_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -5999,7 +5344,7 @@ int16x4_t test_vsri_n_u16(int16x4_t a, int16x4_t b) {
  return vsri_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -6010,14 +5355,14 @@ int32x2_t test_vsri_n_u32(int32x2_t a, int32x2_t b) {
  return vsri_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u8(
// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSRI_N]]
int8x16_t test_vsriq_n_u8(int8x16_t a, int8x16_t b) {
  return vsriq_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -6028,7 +5373,7 @@ int16x8_t test_vsriq_n_u16(int16x8_t a, int16x8_t b) {
  return vsriq_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -6039,7 +5384,7 @@ int32x4_t test_vsriq_n_u32(int32x4_t a, int32x4_t b) {
  return vsriq_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -6050,14 +5395,14 @@ int64x2_t test_vsriq_n_u64(int64x2_t a, int64x2_t b) {
  return vsriq_n_u64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_p8(
// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSRI_N]]
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsri_n_p8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -6068,14 +5413,14 @@ poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsri_n_p16(a, b, 15);
}

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_p8(
// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSRI_N]]
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsriq_n_p8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -6086,14 +5431,14 @@ poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsriq_n_p16(a, b, 15);
}

-// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
  return vsli_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -6104,7 +5449,7 @@ int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
  return vsli_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -6115,14 +5460,14 @@ int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
  return vsli_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s8(
// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
  return vsliq_n_s8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -6133,7 +5478,7 @@ int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
  return vsliq_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -6144,7 +5489,7 @@ int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
  return vsliq_n_s32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -6155,14 +5500,14 @@ int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
  return vsliq_n_s64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsli_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -6173,7 +5518,7 @@ uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsli_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -6184,14 +5529,14 @@ uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsli_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u8(
// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsliq_n_u8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -6202,7 +5547,7 @@ uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsliq_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -6213,7 +5558,7 @@ uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsliq_n_u32(a, b, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -6224,14 +5569,14 @@ uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsliq_n_u64(a, b, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_p8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK: ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsli_n_p8(a, b, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -6242,14 +5587,14 @@ poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsli_n_p16(a, b, 15);
}

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_p8(
// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK: ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsliq_n_p8(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -6260,14 +5605,14 @@ poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsliq_n_p16(a, b, 15);
}

-// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
// CHECK: ret <8 x i8> [[VQSHLU_N]]
int8x8_t test_vqshlu_n_s8(int8x8_t a) {
  return vqshlu_n_s8(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
@@ -6276,7 +5621,7 @@ int16x4_t test_vqshlu_n_s16(int16x4_t a) {
  return vqshlu_n_s16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>)
@@ -6285,14 +5630,14 @@ int32x2_t test_vqshlu_n_s32(int32x2_t a) {
  return vqshlu_n_s32(a, 3);
}

-// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
// CHECK: ret <16 x i8> [[VQSHLU_N]]
int8x16_t test_vqshluq_n_s8(int8x16_t a) {
  return vqshluq_n_s8(a, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
@@ -6301,7 +5646,7 @@ int16x8_t test_vqshluq_n_s16(int16x8_t a) {
  return vqshluq_n_s16(a, 3);
}

-// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
@@ -6310,7 +5655,7 @@ int32x4_t test_vqshluq_n_s32(int32x4_t a) {
  return vqshluq_n_s32(a, 3);
}

-// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>)
@@ -6319,7 +5664,7 @@ int64x2_t test_vqshluq_n_s64(int64x2_t a) {
  return vqshluq_n_s64(a, 3);
}

-// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -6329,7 +5674,7 @@ int8x8_t test_vshrn_n_s16(int16x8_t a) {
  return vshrn_n_s16(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
@@ -6339,7 +5684,7 @@ int16x4_t test_vshrn_n_s32(int32x4_t a) {
  return vshrn_n_s32(a, 9);
}

-// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
@@ -6349,7 +5694,7 @@ int32x2_t test_vshrn_n_s64(int64x2_t a) {
  return vshrn_n_s64(a, 19);
}

-// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -6359,7 +5704,7 @@ uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
  return vshrn_n_u16(a, 3);
}

-// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
@@ -6369,7 +5714,7 @@ uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
  return vshrn_n_u32(a, 9);
}

-// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
@@ -6379,7 +5724,7 @@ uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
  return vshrn_n_u64(a, 19);
}

-// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -6390,7 +5735,7 @@ int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
  return vshrn_high_n_s16(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
@@ -6401,7 +5746,7 @@ int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
  return vshrn_high_n_s32(a, b, 9);
}

-// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
@@ -6412,7 +5757,7 @@ int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
  return vshrn_high_n_s64(a, b, 19);
}

-// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -6423,7 +5768,7 @@ uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
  return vshrn_high_n_u16(a, b, 3);
}

-// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
@@ -6434,7 +5779,7 @@ uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
  return vshrn_high_n_u32(a, b, 9);
}

-// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vshrn_high_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
@@ -6445,7 +5790,7 @@ uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
  return vshrn_high_n_u64(a, b, 19);
}

-// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
CHECK-LABEL: @test_vqshrun_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) @@ -6454,7 +5799,7 @@ int8x8_t test_vqshrun_n_s16(int16x8_t a) { return vqshrun_n_s16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshrun_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) @@ -6463,7 +5808,7 @@ int16x4_t test_vqshrun_n_s32(int32x4_t a) { return vqshrun_n_s32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqshrun_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) @@ -6472,7 +5817,7 @@ int32x2_t test_vqshrun_n_s64(int64x2_t a) { return vqshrun_n_s64(a, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqshrun_high_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) @@ -6482,7 +5827,7 @@ int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) { return vqshrun_high_n_s16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqshrun_high_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) @@ -6492,7 +5837,7 @@ int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) { return vqshrun_high_n_s32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqshrun_high_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) @@ -6502,7 +5847,7 @@ int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) { return vqshrun_high_n_s64(a, b, 19); } -// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) @@ -6511,7 +5856,7 @@ int8x8_t test_vrshrn_n_s16(int16x8_t a) { return vrshrn_n_s16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) @@ -6520,7 +5865,7 @@ 
int16x4_t test_vrshrn_n_s32(int32x4_t a) { return vrshrn_n_s32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) @@ -6529,7 +5874,7 @@ int32x2_t test_vrshrn_n_s64(int64x2_t a) { return vrshrn_n_s64(a, 19); } -// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) @@ -6538,7 +5883,7 @@ uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { return vrshrn_n_u16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) @@ -6547,7 +5892,7 @@ uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { return vrshrn_n_u32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vrshrn_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) @@ -6556,7 +5901,7 @@ uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { return vrshrn_n_u64(a, 19); } -// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) @@ -6566,7 +5911,7 @@ int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) { return vrshrn_high_n_s16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) @@ -6576,7 +5921,7 @@ int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) { return vrshrn_high_n_s32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) @@ -6586,7 +5931,7 @@ int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) { return vrshrn_high_n_s64(a, b, 19); } -// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // 
CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) @@ -6596,7 +5941,7 @@ uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { return vrshrn_high_n_u16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) @@ -6606,7 +5951,7 @@ uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { return vrshrn_high_n_u32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrshrn_high_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) @@ -6616,7 +5961,7 @@ uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { return vrshrn_high_n_u64(a, b, 19); } -// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqrshrun_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) @@ -6625,7 +5970,7 @@ int8x8_t test_vqrshrun_n_s16(int16x8_t a) { return vqrshrun_n_s16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqrshrun_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) @@ -6634,7 +5979,7 @@ int16x4_t test_vqrshrun_n_s32(int32x4_t a) { return vqrshrun_n_s32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqrshrun_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) @@ -6643,7 +5988,7 @@ int32x2_t test_vqrshrun_n_s64(int64x2_t a) { return vqrshrun_n_s64(a, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshrun_high_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) @@ -6653,7 +5998,7 @@ int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) { return vqrshrun_high_n_s16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshrun_high_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) @@ -6663,7 +6008,7 @@ int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) { return 
vqrshrun_high_n_s32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshrun_high_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) @@ -6673,7 +6018,7 @@ int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) { return vqrshrun_high_n_s64(a, b, 19); } -// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) @@ -6682,7 +6027,7 @@ int8x8_t test_vqshrn_n_s16(int16x8_t a) { return vqshrn_n_s16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) @@ -6691,7 +6036,7 @@ int16x4_t test_vqshrn_n_s32(int32x4_t a) { return vqshrn_n_s32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) @@ -6700,7 +6045,7 @@ int32x2_t test_vqshrn_n_s64(int64x2_t a) { return vqshrn_n_s64(a, 19); } -// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) @@ -6709,7 +6054,7 @@ uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { return vqshrn_n_u16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) @@ -6718,7 +6063,7 @@ uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { return vqshrn_n_u32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqshrn_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) @@ -6727,7 +6072,7 @@ uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { return vqshrn_n_u64(a, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) @@ -6737,7 
+6082,7 @@ int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) { return vqshrn_high_n_s16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) @@ -6747,7 +6092,7 @@ int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) { return vqshrn_high_n_s32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) @@ -6757,7 +6102,7 @@ int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) { return vqshrn_high_n_s64(a, b, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) @@ -6767,7 +6112,7 @@ uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { return vqshrn_high_n_u16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) @@ -6777,7 +6122,7 @@ uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { return vqshrn_high_n_u32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqshrn_high_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) @@ -6787,7 +6132,7 @@ uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { return vqshrn_high_n_u64(a, b, 19); } -// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqrshrn_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) @@ -6796,7 +6141,7 @@ int8x8_t test_vqrshrn_n_s16(int16x8_t a) { return vqrshrn_n_s16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqrshrn_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) @@ -6805,7 +6150,7 @@ int16x4_t test_vqrshrn_n_s32(int32x4_t a) { return vqrshrn_n_s32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: 
@test_vqrshrn_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) @@ -6814,7 +6159,7 @@ int32x2_t test_vqrshrn_n_s64(int64x2_t a) { return vqrshrn_n_s64(a, 19); } -// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqrshrn_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) @@ -6823,7 +6168,7 @@ uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { return vqrshrn_n_u16(a, 3); } -// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqrshrn_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) @@ -6832,7 +6177,7 @@ uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { return vqrshrn_n_u32(a, 9); } -// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqrshrn_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) @@ -6841,7 +6186,7 @@ uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { return vqrshrn_n_u64(a, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) @@ -6851,7 +6196,7 @@ int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) { return vqrshrn_high_n_s16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) @@ -6861,7 +6206,7 @@ int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) { return vqrshrn_high_n_s32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) @@ -6871,7 +6216,7 @@ int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) { return vqrshrn_high_n_s64(a, b, 19); } -// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> 
[[VQRSHRN_N]], i32 3) @@ -6881,7 +6226,7 @@ uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { return vqrshrn_high_n_u16(a, b, 3); } -// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) @@ -6891,7 +6236,7 @@ uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { return vqrshrn_high_n_u32(a, b, 9); } -// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshrn_high_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) @@ -6901,7 +6246,7 @@ uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { return vqrshrn_high_n_u64(a, b, 19); } -// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s8( // CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -6909,7 +6254,7 @@ int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> @@ -6919,7 +6264,7 @@ int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 9); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> @@ -6929,7 +6274,7 @@ int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 19); } -// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u8( // CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -6937,7 +6282,7 @@ uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> @@ -6947,7 +6292,7 @@ uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 9); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> @@ -6957,7 +6302,7 @@ uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 19); } -// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// 
CHECK-LABEL: @test_vshll_high_n_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -6966,7 +6311,7 @@ int16x8_t test_vshll_high_n_s8(int8x16_t a) { return vshll_high_n_s8(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -6977,7 +6322,7 @@ int32x4_t test_vshll_high_n_s16(int16x8_t a) { return vshll_high_n_s16(a, 9); } -// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -6988,7 +6333,7 @@ int64x2_t test_vshll_high_n_s32(int32x4_t a) { return vshll_high_n_s32(a, 19); } -// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -6997,7 +6342,7 @@ uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { return vshll_high_n_u8(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -7008,7 +6353,7 @@ uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { return vshll_high_n_u16(a, 9); } -// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -7019,57 +6364,53 @@ uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { return vshll_high_n_u32(a, 19); } -// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_s8( // CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I]] int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I]] int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: define 
<2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I]] int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_u8( // CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I]] uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I]] uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I]] uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovl_high_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_s8( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] @@ -7077,27 +6418,25 @@ int16x8_t test_vmovl_high_s8(int8x16_t a) { return vmovl_high_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_high_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vmovl_high_s16(int16x8_t a) { return vmovl_high_s16(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovl_high_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP1]] int64x2_t test_vmovl_high_s32(int32x4_t a) { return vmovl_high_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovl_high_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_u8( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] @@ -7105,27 +6444,25 @@ uint16x8_t test_vmovl_high_u8(uint8x16_t a) { return vmovl_high_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_high_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_u16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] uint32x4_t test_vmovl_high_u16(uint16x8_t a) { return vmovl_high_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovl_high_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_high_u32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP1]] uint64x2_t test_vmovl_high_u32(uint32x4_t a) { return vmovl_high_u32(a); } -// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vcvt_n_f32_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) @@ -7134,7 +6471,7 @@ float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return vcvt_n_f32_s32(a, 31); } -// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_f32_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) @@ -7143,7 +6480,7 @@ float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 31); } -// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_f64_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) @@ -7152,7 +6489,7 @@ float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) { return vcvtq_n_f64_s64(a, 50); } -// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vcvt_n_f32_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) @@ -7161,7 +6498,7 @@ float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 31); } -// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_f32_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // 
CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) @@ -7170,7 +6507,7 @@ float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 31); } -// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_f64_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) @@ -7179,7 +6516,7 @@ float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) { return vcvtq_n_f64_u64(a, 50); } -// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vcvt_n_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> // CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) @@ -7188,7 +6525,7 @@ int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 31); } -// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> // CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) @@ -7197,7 +6534,7 @@ int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 31); } -// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_s64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> // CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) @@ -7206,7 +6543,7 @@ int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) { return vcvtq_n_s64_f64(a, 50); } -// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vcvt_n_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> // CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) @@ -7215,7 +6552,7 @@ uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 31); } -// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> // CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) @@ -7224,7 +6561,7 @@ uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 31); } -// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vcvtq_n_u64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> // CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) @@ -7233,7 +6570,7 @@ uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) { return vcvtq_n_u64_f64(a, 50); } -// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x 
i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -7242,33 +6579,29 @@ int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -7277,33 +6610,29 @@ uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_s8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7314,37 +6643,33 @@ int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) { return vaddl_high_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_s16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) { return vaddl_high_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_s32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add 
<2 x i64> [[TMP1]], [[TMP3]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) { return vaddl_high_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_u8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7355,37 +6680,33 @@ uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) { return vaddl_high_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_u16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) { return vaddl_high_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_high_u32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) { return vaddl_high_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -7393,27 +6714,25 @@ int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return 
vaddw_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -7421,27 +6740,25 @@ uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_s8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] @@ -7450,29 +6767,27 @@ int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) { return vaddw_high_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_s16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, 
[[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) { return vaddw_high_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_s32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) { return vaddw_high_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_u8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] @@ -7481,29 +6796,27 @@ uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) { return vaddw_high_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_u16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) { return vaddw_high_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_high_u32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) { return vaddw_high_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -7512,33 +6825,29 @@ int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { return vsubl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to 
<8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { return vsubl_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { return vsubl_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -7547,33 +6856,29 @@ uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { return vsubl_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { return vsubl_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: 
[[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { return vsubl_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_s8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7584,37 +6889,33 @@ int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) { return vsubl_high_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_s16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) { return vsubl_high_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_s32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) { return vsubl_high_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_u8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x 
i16> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7625,37 +6926,33 @@ uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) { return vsubl_high_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_u16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) { return vsubl_high_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_high_u32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> // CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) { return vsubl_high_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -7663,27 +6960,25 @@ int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { return vsubw_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t 
test_vsubw_s16(int32x4_t a, int16x4_t b) { return vsubw_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -7691,27 +6986,25 @@ uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_s8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] @@ -7720,29 +7013,27 @@ int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) { return vsubw_high_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_s16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) { return vsubw_high_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_s32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // 
CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) { return vsubw_high_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_u8( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] @@ -7751,34 +7042,30 @@ uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) { return vsubw_high_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_u16( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) { return vsubw_high_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_high_u32( // CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]] +// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) { return vsubw_high_u32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VADDHN2_I]] @@ -7786,12 +7073,10 @@ int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: 
[[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VADDHN2_I]] @@ -7799,12 +7084,10 @@ int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VADDHN2_I]] @@ -7812,12 +7095,10 @@ int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VADDHN2_I]] @@ -7825,12 +7106,10 @@ uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VADDHN2_I]] @@ -7838,12 +7117,10 @@ uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x 
i32> // CHECK: ret <2 x i32> [[VADDHN2_I]] @@ -7851,12 +7128,10 @@ uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7865,12 +7140,10 @@ int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vaddhn_high_s16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -7879,12 +7152,10 @@ int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vaddhn_high_s32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -7893,12 +7164,10 @@ int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vaddhn_high_s64(r, a, b); } -// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] 
+// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -7907,12 +7176,10 @@ uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vaddhn_high_u16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -7921,12 +7188,10 @@ uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vaddhn_high_u32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -7935,166 +7200,132 @@ uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vaddhn_high_u64(r, a, b); } -// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vraddhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRADDHN_V2_I]] int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vraddhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] 
to <4 x i32> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vraddhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vraddhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRADDHN_V2_I]] uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vraddhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vraddhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// 
CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vraddhn_high_s16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vraddhn_high_s32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> 
[[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vraddhn_high_s64(r, a, b); } -// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vraddhn_high_u16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vraddhn_high_u32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vraddhn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x 
i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vraddhn_high_u64(r, a, b); } -// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VSUBHN2_I]] @@ -8102,12 +7333,10 @@ int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VSUBHN2_I]] @@ -8115,12 +7344,10 @@ int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VSUBHN2_I]] @@ -8128,12 +7355,10 @@ int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { return vsubhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: 
[[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VSUBHN2_I]] @@ -8141,12 +7366,10 @@ uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VSUBHN2_I]] @@ -8154,12 +7377,10 @@ uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VSUBHN2_I]] @@ -8167,12 +7388,10 @@ uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { return vsubhn_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -8181,12 +7400,10 @@ int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vsubhn_high_s16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> 
[[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -8195,12 +7412,10 @@ int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vsubhn_high_s32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -8209,12 +7424,10 @@ int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vsubhn_high_s64(r, a, b); } -// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -8223,12 +7436,10 @@ uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vsubhn_high_u16(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -8237,12 +7448,10 @@ uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vsubhn_high_u32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a 
to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -8251,228 +7460,189 @@ uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vsubhn_high_u64(r, a, b); } -// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x 
i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vrsubhn_high_s16(r, a, b); } -// CHECK-LABEL: 
define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vrsubhn_high_s32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vrsubhn_high_s64(r, a, b); } -// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vrsubhn_high_u16(r, a, b); } -// 
CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vrsubhn_high_u32(r, a, b); } -// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vrsubhn_high_u64(r, a, b); } -// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vabdl_s8( // CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I_I]] int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { + +// CHECK-LABEL: @test_vabdl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[TMP2:%.*]] = 
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I]]
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
  return vabdl_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I]]
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
  return vabdl_s32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_u8(
// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[VMOVL_I_I]]
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
  return vabdl_u8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I]]
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I]]
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vabal_s8(
// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
@@ -8480,35 +7650,32 @@ uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_u8(
// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
@@ -8516,36 +7683,32 @@ int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vabdl_high_s8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8554,37 +7717,34 @@ uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
  return vabdl_high_s8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
  return vabdl_high_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_high_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
  return vabdl_high_s32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_high_u8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8593,38 +7753,34 @@ int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
  return vabdl_high_u8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_high_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
  return vabdl_high_u16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vabdl_high_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
  return vabdl_high_u32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK-LABEL: @test_vabal_high_s8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8634,39 +7790,36 @@ uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vabal_high_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vabal_high_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_high_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vabal_high_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_high_u8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8676,93 +7829,86 @@ int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vabal_high_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_high_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vabal_high_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vabal_high_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
+// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vabal_high_u32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmull_s8(
// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_u8(
// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmull_high_s8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8770,31 +7916,30 @@ uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
  return vmull_high_s8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: ret <4 x i32> [[VMULL2_I_I]]
int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
  return vmull_high_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_high_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: ret <2 x i64> [[VMULL2_I_I]]
int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
  return vmull_high_s32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_high_u8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8802,91 +7947,86 @@ int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
  return vmull_high_u8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_high_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: ret <4 x i32> [[VMULL2_I_I]]
uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
  return vmull_high_u16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vmull_high_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: ret <2 x i64> [[VMULL2_I_I]]
uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
  return vmull_high_u32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vmlal_s8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlal_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_u8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlal_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_u32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK-LABEL: @test_vmlal_high_s8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8895,33 +8035,32 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vmlal_high_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vmlal_high_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_high_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vmlal_high_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_high_u8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -8930,93 +8069,88 @@ int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vmlal_high_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_high_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vmlal_high_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlal_high_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vmlal_high_u32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vmlsl_s8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_u8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK-LABEL: @test_vmlsl_high_s8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -9025,33 +8159,32 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vmlsl_high_s8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[SUB_I_I]]
int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vmlsl_high_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_high_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[SUB_I_I]]
int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vmlsl_high_s32(a, b, c);
}
-// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_high_u8(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
@@ -9060,215 +8193,179 @@ int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsl_high_u8(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_high_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[SUB_I_I]]
uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsl_high_u16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+
+// CHECK-LABEL: @test_vmlsl_high_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[SUB_I_I]]
uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsl_high_u32(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vqdmull_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_s32(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_s32(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_high_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) #4
+// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I_I]]
[[VQDMULL_V2_I_I]] int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) { return vqdmull_high_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { + +// CHECK-LABEL: @test_vqdmull_high_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) #4 +// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4 // CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQDMULL_V2_I_I]] int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) { return vqdmull_high_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vqdmlal_high_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 -// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) #4 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]] int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlal_high_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vqdmlal_high_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: 
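// [Editor's sketch, not from the diff] The vqdmlal/vqdmlsl hunks above show the
// lowering as two intrinsic calls: a saturating doubling multiply-long
// (sqdmull) followed by a saturating add or sub (sqadd/sqsub). Composing the
// two public intrinsics by hand yields the same pair of calls; qdmlal_by_parts
// is a made-up name, and <arm_neon.h> on an AArch64 target is assumed.
#include <arm_neon.h>
int32x4_t qdmlal_by_parts(int32x4_t acc, int16x4_t b, int16x4_t c) {
  return vqaddq_s32(acc, vqdmull_s16(b, c)); // same sqdmull + sqadd sequence
}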
[[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 -// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]]) #4 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]] int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlal_high_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_s16( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 -// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) #4 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]] int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlsl_high_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vqdmlsl_high_s32( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 -// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> 
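// [Editor's sketch, not from the diff] In the *_high hunks, the
// shufflevector <i32 4, i32 5, i32 6, i32 7> lines are the upper-half
// extraction: each *_high intrinsic is the base intrinsic applied to
// vget_high of its operands. A hand-written equivalent (made-up name,
// assumes <arm_neon.h>):
#include <arm_neon.h>
int32x4_t qdmull_high_by_parts(int16x8_t a, int16x8_t b) {
  return vqdmull_s16(vget_high_s16(a), vget_high_s16(b)); // == vqdmull_high_s16(a, b)
}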
[[VQDMLAL2_I_I]]) #4 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]] int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlsl_high_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmull_p8( // CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmull_high_p8( // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 @@ -9277,35 +8374,35 @@ poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) { return vmull_high_p8(a, b); } -// CHECK-LABEL: define i64 @test_vaddd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vaddd_s64( // CHECK: [[VADDD_I:%.*]] = add i64 %a, %b // CHECK: ret i64 [[VADDD_I]] int64_t test_vaddd_s64(int64_t a, int64_t b) { return vaddd_s64(a, b); } -// CHECK-LABEL: define i64 @test_vaddd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vaddd_u64( // CHECK: [[VADDD_I:%.*]] = add i64 %a, %b // CHECK: ret i64 [[VADDD_I]] uint64_t test_vaddd_u64(uint64_t a, uint64_t b) { return vaddd_u64(a, b); } -// CHECK-LABEL: define i64 @test_vsubd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vsubd_s64( // CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b // CHECK: ret i64 [[VSUBD_I]] int64_t test_vsubd_s64(int64_t a, int64_t b) { return vsubd_s64(a, b); } -// CHECK-LABEL: define i64 @test_vsubd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vsubd_u64( // CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b // CHECK: ret i64 [[VSUBD_I]] uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { return vsubd_u64(a, b); } -// CHECK-LABEL: define i8 @test_vqaddb_s8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqaddb_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9315,7 +8412,7 @@ int8_t test_vqaddb_s8(int8_t a, int8_t b) { return vqaddb_s8(a, b); } -// CHECK-LABEL: define i16 @test_vqaddh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqaddh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9325,21 +8422,21 @@ int16_t test_vqaddh_s16(int16_t a, int16_t b) { return vqaddh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqadds_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqadds_s32( // CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQADDS_S32_I]] int32_t test_vqadds_s32(int32_t a, int32_t b) { return vqadds_s32(a, b); } -// CHECK-LABEL: define i64 @test_vqaddd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqaddd_s64( // CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4 // 
CHECK: ret i64 [[VQADDD_S64_I]] int64_t test_vqaddd_s64(int64_t a, int64_t b) { return vqaddd_s64(a, b); } -// CHECK-LABEL: define i8 @test_vqaddb_u8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqaddb_u8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9349,7 +8446,7 @@ uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) { return vqaddb_u8(a, b); } -// CHECK-LABEL: define i16 @test_vqaddh_u16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqaddh_u16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9359,21 +8456,21 @@ uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) { return vqaddh_u16(a, b); } -// CHECK-LABEL: define i32 @test_vqadds_u32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqadds_u32( // CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQADDS_U32_I]] uint32_t test_vqadds_u32(uint32_t a, uint32_t b) { return vqadds_u32(a, b); } -// CHECK-LABEL: define i64 @test_vqaddd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqaddd_u64( // CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQADDD_U64_I]] uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) { return vqaddd_u64(a, b); } -// CHECK-LABEL: define i8 @test_vqsubb_s8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqsubb_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9383,7 +8480,7 @@ int8_t test_vqsubb_s8(int8_t a, int8_t b) { return vqsubb_s8(a, b); } -// CHECK-LABEL: define i16 @test_vqsubh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqsubh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9393,21 +8490,21 @@ int16_t test_vqsubh_s16(int16_t a, int16_t b) { return vqsubh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqsubs_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqsubs_s32( // CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQSUBS_S32_I]] int32_t test_vqsubs_s32(int32_t a, int32_t b) { return vqsubs_s32(a, b); } -// CHECK-LABEL: define i64 @test_vqsubd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqsubd_s64( // CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQSUBD_S64_I]] int64_t test_vqsubd_s64(int64_t a, int64_t b) { return vqsubd_s64(a, b); } -// CHECK-LABEL: define i8 @test_vqsubb_u8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqsubb_u8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9417,7 +8514,7 @@ uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) { return 
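// [Editor's sketch, not from the diff] There is no 8- or 16-bit scalar sqadd
// intrinsic, so the vqaddb/vqaddh (and vqsubb/vqsubh) CHECK lines above show
// each scalar inserted into lane 0 of an undef vector, the vector intrinsic
// called, and lane 0 extracted again. A lane-0-equivalent sketch (made-up
// name, assumes <arm_neon.h>; vdup fills all lanes, but only lane 0 matters):
#include <arm_neon.h>
int8_t qaddb_by_parts(int8_t a, int8_t b) {
  return vget_lane_s8(vqadd_s8(vdup_n_s8(a), vdup_n_s8(b)), 0);
}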
vqsubb_u8(a, b); } -// CHECK-LABEL: define i16 @test_vqsubh_u16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqsubh_u16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9427,35 +8524,35 @@ uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) { return vqsubh_u16(a, b); } -// CHECK-LABEL: define i32 @test_vqsubs_u32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqsubs_u32( // CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQSUBS_U32_I]] uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) { return vqsubs_u32(a, b); } -// CHECK-LABEL: define i64 @test_vqsubd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqsubd_u64( // CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQSUBD_U64_I]] uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) { return vqsubd_u64(a, b); } -// CHECK-LABEL: define i64 @test_vshld_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vshld_s64( // CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VSHLD_S64_I]] int64_t test_vshld_s64(int64_t a, int64_t b) { return vshld_s64(a, b); } -// CHECK-LABEL: define i64 @test_vshld_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vshld_u64( // CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VSHLD_U64_I]] uint64_t test_vshld_u64(uint64_t a, uint64_t b) { return vshld_u64(a, b); } -// CHECK-LABEL: define i8 @test_vqshlb_s8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqshlb_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9465,7 +8562,7 @@ int8_t test_vqshlb_s8(int8_t a, int8_t b) { return vqshlb_s8(a, b); } -// CHECK-LABEL: define i16 @test_vqshlh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqshlh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9475,21 +8572,21 @@ int16_t test_vqshlh_s16(int16_t a, int16_t b) { return vqshlh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqshls_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqshls_s32( // CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQSHLS_S32_I]] int32_t test_vqshls_s32(int32_t a, int32_t b) { return vqshls_s32(a, b); } -// CHECK-LABEL: define i64 @test_vqshld_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqshld_s64( // CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQSHLD_S64_I]] int64_t test_vqshld_s64(int64_t a, int64_t b) { return vqshld_s64(a, b); } -// CHECK-LABEL: define i8 @test_vqshlb_u8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqshlb_u8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9499,7 
+8596,7 @@ uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) { return vqshlb_u8(a, b); } -// CHECK-LABEL: define i16 @test_vqshlh_u16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqshlh_u16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9509,36 +8606,35 @@ uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) { return vqshlh_u16(a, b); } -// CHECK-LABEL: define i32 @test_vqshls_u32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqshls_u32( // CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQSHLS_U32_I]] uint32_t test_vqshls_u32(uint32_t a, uint32_t b) { return vqshls_u32(a, b); } -// CHECK-LABEL: define i64 @test_vqshld_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqshld_u64( // CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQSHLD_U64_I]] uint64_t test_vqshld_u64(uint64_t a, uint64_t b) { return vqshld_u64(a, b); } -// CHECK-LABEL: define i64 @test_vrshld_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vrshld_s64( // CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VRSHLD_S64_I]] int64_t test_vrshld_s64(int64_t a, int64_t b) { return vrshld_s64(a, b); } - -// CHECK-LABEL: define i64 @test_vrshld_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vrshld_u64( // CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VRSHLD_U64_I]] uint64_t test_vrshld_u64(uint64_t a, uint64_t b) { return vrshld_u64(a, b); } -// CHECK-LABEL: define i8 @test_vqrshlb_s8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqrshlb_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9548,7 +8644,7 @@ int8_t test_vqrshlb_s8(int8_t a, int8_t b) { return vqrshlb_s8(a, b); } -// CHECK-LABEL: define i16 @test_vqrshlh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqrshlh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9558,21 +8654,21 @@ int16_t test_vqrshlh_s16(int16_t a, int16_t b) { return vqrshlh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqrshls_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqrshls_s32( // CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQRSHLS_S32_I]] int32_t test_vqrshls_s32(int32_t a, int32_t b) { return vqrshls_s32(a, b); } -// CHECK-LABEL: define i64 @test_vqrshld_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqrshld_s64( // CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQRSHLD_S64_I]] int64_t test_vqrshld_s64(int64_t a, int64_t b) { return vqrshld_s64(a, b); } -// CHECK-LABEL: define i8 @test_vqrshlb_u8(i8 %a, i8 %b) #0 { +// CHECK-LABEL: @test_vqrshlb_u8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 // CHECK: 
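// [Editor's note, not from the diff] The 32- and 64-bit scalar shifts, by
// contrast, have first-class scalar intrinsics (e.g.
// @llvm.aarch64.neon.sqshl.i64), so no insertelement/extractelement pair
// appears in their CHECK lines:
#include <arm_neon.h>
int64_t qshld_wrapper(int64_t a, int64_t b) { // made-up wrapper name
  return vqshld_s64(a, b); // single call to the i64 sqshl intrinsic
}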
[[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 @@ -9582,7 +8678,7 @@ uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) { return vqrshlb_u8(a, b); } -// CHECK-LABEL: define i16 @test_vqrshlh_u16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqrshlh_u16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9592,124 +8688,113 @@ uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) { return vqrshlh_u16(a, b); } -// CHECK-LABEL: define i32 @test_vqrshls_u32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqrshls_u32( // CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQRSHLS_U32_I]] uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) { return vqrshls_u32(a, b); } -// CHECK-LABEL: define i64 @test_vqrshld_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vqrshld_u64( // CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4 // CHECK: ret i64 [[VQRSHLD_U64_I]] uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) { return vqrshld_u64(a, b); } -// CHECK-LABEL: define i64 @test_vpaddd_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vpaddd_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4 // CHECK: ret i64 [[VPADDD_S64_I]] int64_t test_vpaddd_s64(int64x2_t a) { return vpaddd_s64(a); } -// CHECK-LABEL: define float @test_vpadds_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vpadds_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 0 -// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 1 +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1 // CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]] // CHECK: ret float [[VPADDD_I]] float32_t test_vpadds_f32(float32x2_t a) { return vpadds_f32(a); } -// CHECK-LABEL: define double @test_vpaddd_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vpaddd_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 -// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1 // CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]] // CHECK: ret double [[VPADDD_I]] float64_t test_vpaddd_f64(float64x2_t a) { return vpaddd_f64(a); } -// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vpmaxnms_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: [[VPMAXNMS_F32_I:%.*]] = 
call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) #4 // CHECK: ret float [[VPMAXNMS_F32_I]] float32_t test_vpmaxnms_f32(float32x2_t a) { return vpmaxnms_f32(a); } -// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vpmaxnmqd_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) #4 // CHECK: ret double [[VPMAXNMQD_F64_I]] float64_t test_vpmaxnmqd_f64(float64x2_t a) { return vpmaxnmqd_f64(a); } -// CHECK-LABEL: define float @test_vpmaxs_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vpmaxs_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) #4 // CHECK: ret float [[VPMAXS_F32_I]] float32_t test_vpmaxs_f32(float32x2_t a) { return vpmaxs_f32(a); } -// CHECK-LABEL: define double @test_vpmaxqd_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vpmaxqd_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) #4 // CHECK: ret double [[VPMAXQD_F64_I]] float64_t test_vpmaxqd_f64(float64x2_t a) { return vpmaxqd_f64(a); } -// CHECK-LABEL: define float @test_vpminnms_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vpminnms_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) #4 // CHECK: ret float [[VPMINNMS_F32_I]] float32_t test_vpminnms_f32(float32x2_t a) { return vpminnms_f32(a); } -// CHECK-LABEL: define double @test_vpminnmqd_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vpminnmqd_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) #4 // CHECK: ret double [[VPMINNMQD_F64_I]] float64_t test_vpminnmqd_f64(float64x2_t a) { return vpminnmqd_f64(a); } -// CHECK-LABEL: define float @test_vpmins_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vpmins_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) #4 // CHECK: ret float [[VPMINS_F32_I]] float32_t test_vpmins_f32(float32x2_t a) { return vpmins_f32(a); } -// CHECK-LABEL: define double 
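// [Editor's sketch, not from the diff] vpadds_f32 above needs no intrinsic at
// all: the extractelement/fadd CHECK lines add lane 0 to lane 1 directly,
// while the other pairwise reductions (vpmaxnm*, vpmax*, vpmin*) keep their
// fmaxnmv/fmaxv/fminv intrinsic calls. A by-hand equivalent of vpadds_f32
// (made-up name, assumes <arm_neon.h>):
#include <arm_neon.h>
float padds_by_parts(float32x2_t a) {
  return vget_lane_f32(a, 0) + vget_lane_f32(a, 1); // the fadd in the IR
}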
@test_vpminqd_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vpminqd_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) #4 // CHECK: ret double [[VPMINQD_F64_I]] float64_t test_vpminqd_f64(float64x2_t a) { return vpminqd_f64(a); } -// CHECK-LABEL: define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqdmulhh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9719,14 +8804,14 @@ int16_t test_vqdmulhh_s16(int16_t a, int16_t b) { return vqdmulhh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqdmulhs_s32( // CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQDMULHS_S32_I]] int32_t test_vqdmulhs_s32(int32_t a, int32_t b) { return vqdmulhs_s32(a, b); } -// CHECK-LABEL: define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) #0 { +// CHECK-LABEL: @test_vqrdmulhh_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 // CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 @@ -9736,155 +8821,151 @@ int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) { return vqrdmulhh_s16(a, b); } -// CHECK-LABEL: define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) #0 { +// CHECK-LABEL: @test_vqrdmulhs_s32( // CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4 // CHECK: ret i32 [[VQRDMULHS_S32_I]] int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) { return vqrdmulhs_s32(a, b); } -// CHECK-LABEL: define float @test_vmulxs_f32(float %a, float %b) #0 { +// CHECK-LABEL: @test_vmulxs_f32( // CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4 // CHECK: ret float [[VMULXS_F32_I]] float32_t test_vmulxs_f32(float32_t a, float32_t b) { return vmulxs_f32(a, b); } -// CHECK-LABEL: define double @test_vmulxd_f64(double %a, double %b) #0 { +// CHECK-LABEL: @test_vmulxd_f64( // CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4 // CHECK: ret double [[VMULXD_F64_I]] float64_t test_vmulxd_f64(float64_t a, float64_t b) { return vmulxd_f64(a, b); } -// CHECK-LABEL: define <1 x double> @test_vmulx_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK-LABEL: @test_vmulx_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) #4 +// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> %a, <1 x double> %b) #4 // CHECK: ret <1 x double> [[VMULX2_I]] float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) { return vmulx_f64(a, b); } -// CHECK-LABEL: 
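// [Editor's note, not from the diff] The pattern running through these hunks
// is uniform: the old CHECK lines expected each operand to be bitcast to
// <8 x i8> or <16 x i8> and immediately bitcast back before the intrinsic
// call, and the result to be re-bitcast before the ret. The updated lines
// feed %a and %b (or the shuffled halves) to the call directly, so every
// round-trip bitcast CHECK drops out.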
define float @test_vrecpss_f32(float %a, float %b) #0 { +// CHECK-LABEL: @test_vrecpss_f32( // CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4 // CHECK: ret float [[VRECPS_I]] float32_t test_vrecpss_f32(float32_t a, float32_t b) { return vrecpss_f32(a, b); } -// CHECK-LABEL: define double @test_vrecpsd_f64(double %a, double %b) #0 { +// CHECK-LABEL: @test_vrecpsd_f64( // CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4 // CHECK: ret double [[VRECPS_I]] float64_t test_vrecpsd_f64(float64_t a, float64_t b) { return vrecpsd_f64(a, b); } -// CHECK-LABEL: define float @test_vrsqrtss_f32(float %a, float %b) #0 { +// CHECK-LABEL: @test_vrsqrtss_f32( // CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4 // CHECK: ret float [[VRSQRTSS_F32_I]] float32_t test_vrsqrtss_f32(float32_t a, float32_t b) { return vrsqrtss_f32(a, b); } -// CHECK-LABEL: define double @test_vrsqrtsd_f64(double %a, double %b) #0 { +// CHECK-LABEL: @test_vrsqrtsd_f64( // CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4 // CHECK: ret double [[VRSQRTSD_F64_I]] float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) { return vrsqrtsd_f64(a, b); } -// CHECK-LABEL: define float @test_vcvts_f32_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vcvts_f32_s32( // CHECK: [[TMP0:%.*]] = sitofp i32 %a to float // CHECK: ret float [[TMP0]] float32_t test_vcvts_f32_s32(int32_t a) { return vcvts_f32_s32(a); } -// CHECK-LABEL: define double @test_vcvtd_f64_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vcvtd_f64_s64( // CHECK: [[TMP0:%.*]] = sitofp i64 %a to double // CHECK: ret double [[TMP0]] float64_t test_vcvtd_f64_s64(int64_t a) { return vcvtd_f64_s64(a); } -// CHECK-LABEL: define float @test_vcvts_f32_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vcvts_f32_u32( // CHECK: [[TMP0:%.*]] = uitofp i32 %a to float // CHECK: ret float [[TMP0]] float32_t test_vcvts_f32_u32(uint32_t a) { return vcvts_f32_u32(a); } -// CHECK-LABEL: define double @test_vcvtd_f64_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vcvtd_f64_u64( // CHECK: [[TMP0:%.*]] = uitofp i64 %a to double // CHECK: ret double [[TMP0]] float64_t test_vcvtd_f64_u64(uint64_t a) { return vcvtd_f64_u64(a); } -// CHECK-LABEL: define float @test_vrecpes_f32(float %a) #0 { +// CHECK-LABEL: @test_vrecpes_f32( // CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4 // CHECK: ret float [[VRECPES_F32_I]] float32_t test_vrecpes_f32(float32_t a) { return vrecpes_f32(a); } - -// CHECK-LABEL: define double @test_vrecped_f64(double %a) #0 { + +// CHECK-LABEL: @test_vrecped_f64( // CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4 // CHECK: ret double [[VRECPED_F64_I]] float64_t test_vrecped_f64(float64_t a) { return vrecped_f64(a); } - -// CHECK-LABEL: define float @test_vrecpxs_f32(float %a) #0 { + +// CHECK-LABEL: @test_vrecpxs_f32( // CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4 // CHECK: ret float [[VRECPXS_F32_I]] float32_t test_vrecpxs_f32(float32_t a) { return vrecpxs_f32(a); - } - -// CHECK-LABEL: define double @test_vrecpxd_f64(double %a) #0 { +} + +// CHECK-LABEL: @test_vrecpxd_f64( // CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4 // CHECK: ret double [[VRECPXD_F64_I]] float64_t test_vrecpxd_f64(float64_t a) { return vrecpxd_f64(a); } -// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x 
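// [Editor's sketch, not from the diff] Typical use of the scalar estimate and
// step intrinsics tested above, per the usual Newton-Raphson recipe: frecpe
// gives a rough 1/x and each frecps multiply refines it. recip_nr is a
// made-up name; <arm_neon.h> on an AArch64 target is assumed.
#include <arm_neon.h>
float recip_nr(float x) {
  float r = vrecpes_f32(x);  // initial estimate of 1/x
  r = r * vrecpss_f32(x, r); // refinement: frecps returns (2 - x*r)
  r = r * vrecpss_f32(x, r); // second step for extra precision
  return r;
}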
i32> %a) #0 { +// CHECK-LABEL: @test_vrsqrte_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4 +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %a) #4 // CHECK: ret <2 x i32> [[VRSQRTE_V1_I]] uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrsqrteq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4 +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %a) #4 // CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } -// CHECK-LABEL: define float @test_vrsqrtes_f32(float %a) #0 { +// CHECK-LABEL: @test_vrsqrtes_f32( // CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4 // CHECK: ret float [[VRSQRTES_F32_I]] float32_t test_vrsqrtes_f32(float32_t a) { return vrsqrtes_f32(a); } -// CHECK-LABEL: define double @test_vrsqrted_f64(double %a) #0 { +// CHECK-LABEL: @test_vrsqrted_f64( // CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4 // CHECK: ret double [[VRSQRTED_F64_I]] float64_t test_vrsqrted_f64(float64_t a) { return vrsqrted_f64(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_u8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* // CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] // CHECK: ret <16 x i8> [[TMP1]] @@ -9892,7 +8973,7 @@ uint8x16_t test_vld1q_u8(uint8_t const *a) { return vld1q_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* // CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] @@ -9901,7 +8982,7 @@ uint16x8_t test_vld1q_u16(uint16_t const *a) { return vld1q_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* // CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] @@ -9910,7 +8991,7 @@ uint32x4_t test_vld1q_u32(uint32_t const *a) { return vld1q_u32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* // CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] @@ -9919,7 +9000,7 @@ uint64x2_t test_vld1q_u64(uint64_t const *a) { return vld1q_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_s8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* // CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] // CHECK: ret <16 x i8> [[TMP1]] @@ -9927,7 +9008,7 @@ int8x16_t test_vld1q_s8(int8_t const *a) { return vld1q_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to 
i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* // CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] @@ -9936,7 +9017,7 @@ int16x8_t test_vld1q_s16(int16_t const *a) { return vld1q_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* // CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] @@ -9945,7 +9026,7 @@ int32x4_t test_vld1q_s32(int32_t const *a) { return vld1q_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* // CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] @@ -9954,7 +9035,7 @@ int64x2_t test_vld1q_s64(int64_t const *a) { return vld1q_s64(a); } -// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1q_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* // CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] @@ -9964,7 +9045,7 @@ float16x8_t test_vld1q_f16(float16_t const *a) { return vld1q_f16(a); } -// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1q_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* // CHECK: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]] @@ -9973,7 +9054,7 @@ float32x4_t test_vld1q_f32(float32_t const *a) { return vld1q_f32(a); } -// CHECK-LABEL: define <2 x double> @test_vld1q_f64(double* %a) #0 { +// CHECK-LABEL: @test_vld1q_f64( // CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>* // CHECK: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]] @@ -9982,7 +9063,7 @@ float64x2_t test_vld1q_f64(float64_t const *a) { return vld1q_f64(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_p8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* // CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] // CHECK: ret <16 x i8> [[TMP1]] @@ -9990,7 +9071,7 @@ poly8x16_t test_vld1q_p8(poly8_t const *a) { return vld1q_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* // CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] @@ -9999,7 +9080,7 @@ poly16x8_t test_vld1q_p16(poly16_t const *a) { return vld1q_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_u8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* // CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] // CHECK: ret <8 x i8> [[TMP1]] @@ -10007,7 +9088,7 @@ uint8x8_t test_vld1_u8(uint8_t const *a) { return vld1_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* // CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] @@ -10016,7 +9097,7 @@ uint16x4_t test_vld1_u16(uint16_t const *a) { return vld1_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to 
i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* // CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] @@ -10025,7 +9106,7 @@ uint32x2_t test_vld1_u32(uint32_t const *a) { return vld1_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* // CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] @@ -10034,7 +9115,7 @@ uint64x1_t test_vld1_u64(uint64_t const *a) { return vld1_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_s8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* // CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] // CHECK: ret <8 x i8> [[TMP1]] @@ -10042,7 +9123,7 @@ int8x8_t test_vld1_s8(int8_t const *a) { return vld1_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* // CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] @@ -10051,7 +9132,7 @@ int16x4_t test_vld1_s16(int16_t const *a) { return vld1_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* // CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] @@ -10060,7 +9141,7 @@ int32x2_t test_vld1_s32(int32_t const *a) { return vld1_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* // CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] @@ -10069,7 +9150,7 @@ int64x1_t test_vld1_s64(int64_t const *a) { return vld1_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* // CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] @@ -10079,7 +9160,7 @@ float16x4_t test_vld1_f16(float16_t const *a) { return vld1_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* // CHECK: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]] @@ -10088,7 +9169,7 @@ float32x2_t test_vld1_f32(float32_t const *a) { return vld1_f32(a); } -// CHECK-LABEL: define <1 x double> @test_vld1_f64(double* %a) #0 { +// CHECK-LABEL: @test_vld1_f64( // CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>* // CHECK: [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]] @@ -10097,7 +9178,7 @@ float64x1_t test_vld1_f64(float64_t const *a) { return vld1_f64(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_p8( // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* // CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] // CHECK: ret <8 x i8> [[TMP1]] @@ -10105,7 +9186,7 @@ poly8x8_t test_vld1_p8(poly8_t const *a) { return vld1_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = 
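// [Editor's note, not from the diff] Every vld1/vld1q hunk follows the same
// shape: bitcast the element pointer to a vector pointer and issue one plain
// IR load, with no target intrinsic involved. For example:
#include <arm_neon.h>
uint8x16_t load16(const uint8_t *p) { // made-up wrapper name
  return vld1q_u8(p); // lowers to a single load <16 x i8>, as in the CHECKs
}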
bitcast i8* [[TMP0]] to <4 x i16>* // CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] @@ -10114,7 +9195,7 @@ poly16x4_t test_vld1_p16(poly16_t const *a) { return vld1_p16(a); } -// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_u8( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* @@ -10131,7 +9212,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const *a) { return vld2q_u8(a); } -// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_u16( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* @@ -10149,7 +9230,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const *a) { return vld2q_u16(a); } -// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld2q_u32( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* @@ -10167,7 +9248,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const *a) { return vld2q_u32(a); } -// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld2q_u64( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* @@ -10185,7 +9266,7 @@ uint64x2x2_t test_vld2q_u64(uint64_t const *a) { return vld2q_u64(a); } -// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_s8( // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* @@ -10202,7 +9283,7 @@ int8x16x2_t test_vld2q_s8(int8_t const *a) { return vld2q_s8(a); } -// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_s16( // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* @@ -10220,7 +9301,7 @@ int16x8x2_t test_vld2q_s16(int16_t const *a) { return vld2q_s16(a); } -// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld2q_s32( // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* @@ -10238,7 +9319,7 @@ int32x4x2_t test_vld2q_s32(int32_t const *a) { return vld2q_s32(a); } -// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld2q_s64( // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* @@ -10256,7 +9337,7 @@ int64x2x2_t test_vld2q_s64(int64_t const *a) { return vld2q_s64(a); } -// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld2q_f16( // CHECK: [[RETVAL:%.*]] = alloca 
%struct.float16x8x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* @@ -10274,7 +9355,7 @@ float16x8x2_t test_vld2q_f16(float16_t const *a) { return vld2q_f16(a); } -// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld2q_f32( // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* @@ -10292,7 +9373,7 @@ float32x4x2_t test_vld2q_f32(float32_t const *a) { return vld2q_f32(a); } -// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_f64(double* %a) #0 { +// CHECK-LABEL: @test_vld2q_f64( // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* @@ -10310,7 +9391,7 @@ float64x2x2_t test_vld2q_f64(float64_t const *a) { return vld2q_f64(a); } -// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_p8( // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* @@ -10327,7 +9408,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const *a) { return vld2q_p8(a); } -// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_p16( // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* @@ -10345,7 +9426,7 @@ poly16x8x2_t test_vld2q_p16(poly16_t const *a) { return vld2q_p16(a); } -// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld2_u8( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* @@ -10362,7 +9443,7 @@ uint8x8x2_t test_vld2_u8(uint8_t const *a) { return vld2_u8(a); } -// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2_u16( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* @@ -10380,7 +9461,7 @@ uint16x4x2_t test_vld2_u16(uint16_t const *a) { return vld2_u16(a); } -// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld2_u32( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* @@ -10398,7 +9479,7 @@ uint32x2x2_t test_vld2_u32(uint32_t const *a) { return vld2_u32(a); } -// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld2_u64( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* @@ -10416,7 +9497,7 @@ uint64x1x2_t test_vld2_u64(uint64_t const *a) { return vld2_u64(a); } -// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 { 
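// [Editor's sketch, not from the diff] The alloca-of-%struct.* CHECK lines in
// the vld2 hunks reflect the aggregate return: the ld2 result is spilled into
// a two-vector struct. A usage sketch (made-up names, assumes <arm_neon.h>):
#include <arm_neon.h>
void deinterleave2(const uint8_t *p, uint8x8_t *even, uint8x8_t *odd) {
  uint8x8x2_t v = vld2_u8(p); // de-interleaving load of 16 bytes
  *even = v.val[0];
  *odd = v.val[1];
}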
+// CHECK-LABEL: @test_vld2_s8( // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* @@ -10433,7 +9514,7 @@ int8x8x2_t test_vld2_s8(int8_t const *a) { return vld2_s8(a); } -// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2_s16( // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* @@ -10451,7 +9532,7 @@ int16x4x2_t test_vld2_s16(int16_t const *a) { return vld2_s16(a); } -// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld2_s32( // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* @@ -10469,7 +9550,7 @@ int32x2x2_t test_vld2_s32(int32_t const *a) { return vld2_s32(a); } -// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld2_s64( // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* @@ -10487,7 +9568,7 @@ int64x1x2_t test_vld2_s64(int64_t const *a) { return vld2_s64(a); } -// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld2_f16( // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* @@ -10505,7 +9586,7 @@ float16x4x2_t test_vld2_f16(float16_t const *a) { return vld2_f16(a); } -// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld2_f32( // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* @@ -10523,7 +9604,7 @@ float32x2x2_t test_vld2_f32(float32_t const *a) { return vld2_f32(a); } -// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_f64(double* %a) #0 { +// CHECK-LABEL: @test_vld2_f64( // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* @@ -10541,7 +9622,7 @@ float64x1x2_t test_vld2_f64(float64_t const *a) { return vld2_f64(a); } -// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld2_p8( // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* @@ -10558,7 +9639,7 @@ poly8x8x2_t test_vld2_p8(poly8_t const *a) { return vld2_p8(a); } -// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld2_p16( // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* @@ -10576,7 +9657,7 @@ poly16x4x2_t test_vld2_p16(poly16_t const *a) { return vld2_p16(a); } -// CHECK-LABEL: define %struct.uint8x16x3_t 
@test_vld3q_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld3q_u8( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* @@ -10593,7 +9674,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const *a) { return vld3q_u8(a); } -// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld3q_u16( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* @@ -10611,7 +9692,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const *a) { return vld3q_u16(a); } -// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld3q_u32( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* @@ -10629,7 +9710,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const *a) { return vld3q_u32(a); } -// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld3q_u64( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* @@ -10647,7 +9728,7 @@ uint64x2x3_t test_vld3q_u64(uint64_t const *a) { return vld3q_u64(a); } -// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld3q_s8( // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* @@ -10664,7 +9745,7 @@ int8x16x3_t test_vld3q_s8(int8_t const *a) { return vld3q_s8(a); } -// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld3q_s16( // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* @@ -10682,7 +9763,7 @@ int16x8x3_t test_vld3q_s16(int16_t const *a) { return vld3q_s16(a); } -// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld3q_s32( // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* @@ -10700,7 +9781,7 @@ int32x4x3_t test_vld3q_s32(int32_t const *a) { return vld3q_s32(a); } -// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld3q_s64( // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* @@ -10718,7 +9799,7 @@ int64x2x3_t test_vld3q_s64(int64_t const *a) { return vld3q_s64(a); } -// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld3q_f16( // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* @@ -10736,7 +9817,7 @@ float16x8x3_t test_vld3q_f16(float16_t const *a) { return 
  return vld3q_f16(a);
 }
-// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
+// CHECK-LABEL: @test_vld3q_f32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
@@ -10754,7 +9835,7 @@ float32x4x3_t test_vld3q_f32(float32_t const *a) {
  return vld3q_f32(a);
 }
-// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_f64(double* %a) #0 {
+// CHECK-LABEL: @test_vld3q_f64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
@@ -10772,7 +9853,7 @@ float64x2x3_t test_vld3q_f64(float64_t const *a) {
  return vld3q_f64(a);
 }
-// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld3q_p8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
@@ -10789,7 +9870,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
  return vld3q_p8(a);
 }
-// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld3q_p16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
@@ -10807,7 +9888,7 @@ poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
  return vld3q_p16(a);
 }
-// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_u8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
@@ -10824,7 +9905,7 @@ uint8x8x3_t test_vld3_u8(uint8_t const *a) {
  return vld3_u8(a);
 }
-// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_u16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
@@ -10842,7 +9923,7 @@ uint16x4x3_t test_vld3_u16(uint16_t const *a) {
  return vld3_u16(a);
 }
-// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld3_u32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
@@ -10860,7 +9941,7 @@ uint32x2x3_t test_vld3_u32(uint32_t const *a) {
  return vld3_u32(a);
 }
-// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld3_u64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
@@ -10878,7 +9959,7 @@ uint64x1x3_t test_vld3_u64(uint64_t const *a) {
  return vld3_u64(a);
 }
-// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_s8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
@@ -10895,7 +9976,7 @@ int8x8x3_t test_vld3_s8(int8_t const *a) {
  return vld3_s8(a);
 }
-// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_s16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
@@ -10913,7 +9994,7 @@ int16x4x3_t test_vld3_s16(int16_t const *a) {
  return vld3_s16(a);
 }
-// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld3_s32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
@@ -10931,7 +10012,7 @@ int32x2x3_t test_vld3_s32(int32_t const *a) {
  return vld3_s32(a);
 }
-// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld3_s64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
@@ -10949,7 +10030,7 @@ int64x1x3_t test_vld3_s64(int64_t const *a) {
  return vld3_s64(a);
 }
-// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
+// CHECK-LABEL: @test_vld3_f16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
@@ -10967,7 +10048,7 @@ float16x4x3_t test_vld3_f16(float16_t const *a) {
  return vld3_f16(a);
 }
-// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 {
+// CHECK-LABEL: @test_vld3_f32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
@@ -10985,7 +10066,7 @@ float32x2x3_t test_vld3_f32(float32_t const *a) {
  return vld3_f32(a);
 }
-// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_f64(double* %a) #0 {
+// CHECK-LABEL: @test_vld3_f64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
@@ -11003,7 +10084,7 @@ float64x1x3_t test_vld3_f64(float64_t const *a) {
  return vld3_f64(a);
 }
-// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_p8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
@@ -11020,7 +10101,7 @@ poly8x8x3_t test_vld3_p8(poly8_t const *a) {
  return vld3_p8(a);
 }
-// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_p16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
@@ -11038,7 +10119,7 @@ poly16x4x3_t test_vld3_p16(poly16_t const *a) {
  return vld3_p16(a);
 }
-// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4q_u8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
@@ -11055,7 +10136,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
  return vld4q_u8(a);
 }
-// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4q_u16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
@@ -11073,7 +10154,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
  return vld4q_u16(a);
 }
-// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld4q_u32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
@@ -11091,7 +10172,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
  return vld4q_u32(a);
 }
-// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_u64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld4q_u64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
@@ -11109,7 +10190,7 @@ uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
  return vld4q_u64(a);
 }
-// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4q_s8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
@@ -11126,7 +10207,7 @@ int8x16x4_t test_vld4q_s8(int8_t const *a) {
  return vld4q_s8(a);
 }
-// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4q_s16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
@@ -11144,7 +10225,7 @@ int16x8x4_t test_vld4q_s16(int16_t const *a) {
  return vld4q_s16(a);
 }
-// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld4q_s32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
@@ -11162,7 +10243,7 @@ int32x4x4_t test_vld4q_s32(int32_t const *a) {
  return vld4q_s32(a);
 }
-// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_s64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld4q_s64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
@@ -11180,7 +10261,7 @@ int64x2x4_t test_vld4q_s64(int64_t const *a) {
  return vld4q_s64(a);
 }
-// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
+// CHECK-LABEL: @test_vld4q_f16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
@@ -11198,7 +10279,7 @@ float16x8x4_t test_vld4q_f16(float16_t const *a) {
  return vld4q_f16(a);
 }
-// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
+// CHECK-LABEL: @test_vld4q_f32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
@@ -11216,7 +10297,7 @@ float32x4x4_t test_vld4q_f32(float32_t const *a) {
  return vld4q_f32(a);
 }
-// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_f64(double* %a) #0 {
+// CHECK-LABEL: @test_vld4q_f64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
@@ -11234,7 +10315,7 @@ float64x2x4_t test_vld4q_f64(float64_t const *a) {
  return vld4q_f64(a);
 }
-// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4q_p8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
@@ -11251,7 +10332,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
  return vld4q_p8(a);
 }
-// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4q_p16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
@@ -11269,7 +10350,7 @@ poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
  return vld4q_p16(a);
 }
-// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_u8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
@@ -11286,7 +10367,7 @@ uint8x8x4_t test_vld4_u8(uint8_t const *a) {
  return vld4_u8(a);
 }
-// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_u16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
@@ -11304,7 +10385,7 @@ uint16x4x4_t test_vld4_u16(uint16_t const *a) {
  return vld4_u16(a);
 }
-// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_u32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
@@ -11322,7 +10403,7 @@ uint32x2x4_t test_vld4_u32(uint32_t const *a) {
  return vld4_u32(a);
 }
-// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_u64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
@@ -11340,7 +10421,7 @@ uint64x1x4_t test_vld4_u64(uint64_t const *a) {
  return vld4_u64(a);
 }
-// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_s8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
@@ -11357,7 +10438,7 @@ int8x8x4_t test_vld4_s8(int8_t const *a) {
  return vld4_s8(a);
 }
-// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_s16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
@@ -11375,7 +10456,7 @@ int16x4x4_t test_vld4_s16(int16_t const *a) {
  return vld4_s16(a);
 }
-// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_s32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
@@ -11393,7 +10474,7 @@ int32x2x4_t test_vld4_s32(int32_t const *a) {
  return vld4_s32(a);
 }
-// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_s64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
@@ -11411,7 +10492,7 @@ int64x1x4_t test_vld4_s64(int64_t const *a) {
  return vld4_s64(a);
 }
-// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
+// CHECK-LABEL: @test_vld4_f16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
@@ -11429,7 +10510,7 @@ float16x4x4_t test_vld4_f16(float16_t const *a) {
  return vld4_f16(a);
 }
-// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
+// CHECK-LABEL: @test_vld4_f32(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
@@ -11447,7 +10528,7 @@ float32x2x4_t test_vld4_f32(float32_t const *a) {
  return vld4_f32(a);
 }
-// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_f64(double* %a) #0 {
+// CHECK-LABEL: @test_vld4_f64(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
@@ -11465,7 +10546,7 @@ float64x1x4_t test_vld4_f64(float64_t const *a) {
  return vld4_f64(a);
 }
-// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_p8(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
@@ -11482,7 +10563,7 @@ poly8x8x4_t test_vld4_p8(poly8_t const *a) {
  return vld4_p8(a);
 }
-// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_p16(
 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
@@ -11500,7 +10581,7 @@ poly16x4x4_t test_vld4_p16(poly16_t const *a) {
  return vld4_p16(a);
 }
-// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
 // CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11508,7 +10589,7 @@ void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
  vst1q_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -11519,7 +10600,7 @@ void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
  vst1q_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -11530,7 +10611,7 @@ void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
  vst1q_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
@@ -11541,7 +10622,7 @@ void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
  vst1q_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
 // CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11549,7 +10630,7 @@ void test_vst1q_s8(int8_t *a, int8x16_t b) {
  vst1q_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -11560,7 +10641,7 @@ void test_vst1q_s16(int16_t *a, int16x8_t b) {
  vst1q_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -11571,7 +10652,7 @@ void test_vst1q_s32(int32_t *a, int32x4_t b) {
  vst1q_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
@@ -11582,7 +10663,7 @@ void test_vst1q_s64(int64_t *a, int64x2_t b) {
  vst1q_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1q_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -11593,7 +10674,7 @@ void test_vst1q_f16(float16_t *a, float16x8_t b) {
  vst1q_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1q_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -11604,7 +10685,7 @@ void test_vst1q_f32(float32_t *a, float32x4_t b) {
  vst1q_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_f64(double* %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vst1q_f64(
 // CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
@@ -11615,7 +10696,7 @@ void test_vst1q_f64(float64_t *a, float64x2_t b) {
  vst1q_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_p8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
 // CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11623,7 +10704,7 @@ void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
  vst1q_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -11634,7 +10715,7 @@ void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
  vst1q_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_u8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
 // CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11642,7 +10723,7 @@ void test_vst1_u8(uint8_t *a, uint8x8_t b) {
  vst1_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -11653,7 +10734,7 @@ void test_vst1_u16(uint16_t *a, uint16x4_t b) {
  vst1_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -11664,7 +10745,7 @@ void test_vst1_u32(uint32_t *a, uint32x2_t b) {
  vst1_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_u64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
@@ -11675,7 +10756,7 @@ void test_vst1_u64(uint64_t *a, uint64x1_t b) {
  vst1_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_s8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
 // CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11683,7 +10764,7 @@ void test_vst1_s8(int8_t *a, int8x8_t b) {
  vst1_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -11694,7 +10775,7 @@ void test_vst1_s16(int16_t *a, int16x4_t b) {
  vst1_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -11705,7 +10786,7 @@ void test_vst1_s32(int32_t *a, int32x2_t b) {
  vst1_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
@@ -11716,7 +10797,7 @@ void test_vst1_s64(int64_t *a, int64x1_t b) {
  vst1_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -11727,7 +10808,7 @@ void test_vst1_f16(float16_t *a, float16x4_t b) {
  vst1_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -11738,7 +10819,7 @@ void test_vst1_f32(float32_t *a, float32x2_t b) {
  vst1_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_f64(double* %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vst1_f64(
 // CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
@@ -11749,7 +10830,7 @@ void test_vst1_f64(float64_t *a, float64x1_t b) {
  vst1_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_p8(
 // CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
 // CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]]
 // CHECK: ret void
@@ -11757,7 +10838,7 @@ void test_vst1_p8(poly8_t *a, poly8x8_t b) {
  vst1_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -11768,7 +10849,7 @@ void test_vst1_p16(poly16_t *a, poly16x4_t b) {
  vst1_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
@@ -11788,7 +10869,7 @@ void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
  vst2q_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
@@ -11813,7 +10894,7 @@ void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
  vst2q_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
@@ -11838,7 +10919,7 @@ void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
  vst2q_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
@@ -11863,7 +10944,7 @@ void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
  vst2q_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
@@ -11883,7 +10964,7 @@ void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
  vst2q_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
@@ -11908,7 +10989,7 @@ void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
  vst2q_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
@@ -11933,7 +11014,7 @@ void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
  vst2q_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
@@ -11958,7 +11039,7 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
  vst2q_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
@@ -11983,7 +11064,7 @@ void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
  vst2q_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
@@ -12008,7 +11089,7 @@ void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
  vst2q_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_f64(
 // CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
@@ -12033,7 +11114,7 @@ void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
  vst2q_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
@@ -12053,7 +11134,7 @@ void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
  vst2q_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
@@ -12078,7 +11159,7 @@ void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
  vst2q_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
@@ -12098,7 +11179,7 @@ void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
  vst2_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
@@ -12123,7 +11204,7 @@ void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
  vst2_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
@@ -12148,7 +11229,7 @@ void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
  vst2_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
@@ -12173,7 +11254,7 @@ void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
  vst2_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
@@ -12193,7 +11274,7 @@ void test_vst2_s8(int8_t *a, int8x8x2_t b) {
  vst2_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
@@ -12218,7 +11299,7 @@ void test_vst2_s16(int16_t *a, int16x4x2_t b) {
  vst2_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
@@ -12243,7 +11324,7 @@ void test_vst2_s32(int32_t *a, int32x2x2_t b) {
  vst2_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
@@ -12268,7 +11349,7 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) {
  vst2_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
@@ -12293,7 +11374,7 @@ void test_vst2_f16(float16_t *a, float16x4x2_t b) {
  vst2_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
@@ -12318,7 +11399,7 @@ void test_vst2_f32(float32_t *a, float32x2x2_t b) {
  vst2_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_f64(
 // CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
@@ -12343,7 +11424,7 @@ void test_vst2_f64(float64_t *a, float64x1x2_t b) {
  vst2_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
@@ -12363,7 +11444,7 @@ void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
  vst2_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
@@ -12388,7 +11469,7 @@ void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
  vst2_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
@@ -12411,7 +11492,7 @@ void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
  vst3q_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
@@ -12441,7 +11522,7 @@ void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
  vst3q_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
@@ -12471,7 +11552,7 @@ void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
  vst3q_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
@@ -12501,7 +11582,7 @@ void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
  vst3q_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
@@ -12524,7 +11605,7 @@ void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
  vst3q_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
@@ -12554,7 +11635,7 @@ void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
  vst3q_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
@@ -12584,7 +11665,7 @@ void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
  vst3q_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
@@ -12614,7 +11695,7 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
  vst3q_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
@@ -12644,7 +11725,7 @@ void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
  vst3q_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
@@ -12674,7 +11755,7 @@ void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
  vst3q_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_f64(
 // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
@@ -12704,7 +11785,7 @@ void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
  vst3q_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
@@ -12727,7 +11808,7 @@ void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
  vst3q_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3q_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
@@ -12757,7 +11838,7 @@ void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
  vst3q_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
@@ -12780,7 +11861,7 @@ void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
  vst3_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
@@ -12810,7 +11891,7 @@ void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
  vst3_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
@@ -12840,7 +11921,7 @@ void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
  vst3_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
@@ -12870,7 +11951,7 @@ void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
  vst3_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
@@ -12893,7 +11974,7 @@ void test_vst3_s8(int8_t *a, int8x8x3_t b) {
  vst3_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
@@ -12923,7 +12004,7 @@ void test_vst3_s16(int16_t *a, int16x4x3_t b) {
  vst3_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
@@ -12953,7 +12034,7 @@ void test_vst3_s32(int32_t *a, int32x2x3_t b) {
  vst3_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
@@ -12983,7 +12064,7 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) {
  vst3_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
@@ -13013,7 +12094,7 @@ void test_vst3_f16(float16_t *a, float16x4x3_t b) {
  vst3_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
@@ -13043,7 +12124,7 @@ void test_vst3_f32(float32_t *a, float32x2x3_t b) {
  vst3_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_f64(
 // CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
@@ -13073,7 +12154,7 @@ void test_vst3_f64(float64_t *a, float64x1x3_t b) {
  vst3_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
@@ -13096,7 +12177,7 @@ void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
  vst3_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst3_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
@@ -13126,7 +12207,7 @@ void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
  vst3_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
@@ -13152,7 +12233,7 @@ void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
  vst4q_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
@@ -13187,7 +12268,7 @@ void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
  vst4q_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
@@ -13222,7 +12303,7 @@ void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
  vst4q_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
@@ -13257,7 +12338,7 @@ void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
  vst4q_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
@@ -13283,7 +12364,7 @@ void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
  vst4q_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
@@ -13318,7 +12399,7 @@ void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
  vst4q_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
@@ -13353,7 +12434,7 @@ void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
  vst4q_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
@@ -13388,7 +12469,7 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
  vst4q_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
@@ -13423,7 +12504,7 @@ void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
  vst4q_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
@@ -13458,7 +12539,7 @@ void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
  vst4q_f32(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_f64(
 // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
@@ -13493,7 +12574,7 @@ void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
  vst4q_f64(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
@@ -13519,7 +12600,7 @@ void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
  vst4q_p8(a, b);
 }
-// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4q_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
@@ -13554,7 +12635,7 @@ void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
  vst4q_p16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
@@ -13580,7 +12661,7 @@ void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
  vst4_u8(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
@@ -13615,7 +12696,7 @@ void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
  vst4_u16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
@@ -13650,7 +12731,7 @@ void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
  vst4_u32(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
@@ -13685,7 +12766,7 @@ void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
  vst4_u64(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
@@ -13711,7 +12792,7 @@ void test_vst4_s8(int8_t *a, int8x8x4_t b) {
  vst4_s8(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
@@ -13746,7 +12827,7 @@ void test_vst4_s16(int16_t *a, int16x4x4_t b) {
  vst4_s16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
@@ -13781,7 +12862,7 @@ void test_vst4_s32(int32_t *a, int32x2x4_t b) {
  vst4_s32(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
@@ -13816,7 +12897,7 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) {
  vst4_s64(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
@@ -13851,7 +12932,7 @@ void test_vst4_f16(float16_t *a, float16x4x4_t b) {
  vst4_f16(a, b);
 }
-// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst4_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
@@ -13886,7 +12967,7 @@ void test_vst4_f32(float32_t *a, float32x2x4_t b) {
  vst4_f32(a, b);
 }
@test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_f64( // CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 @@ -13921,7 +13002,7 @@ void test_vst4_f64(float64_t *a, float64x1x4_t b) { vst4_f64(a, b); } -// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 @@ -13947,7 +13028,7 @@ void test_vst4_p8(poly8_t *a, poly8x8x4_t b) { vst4_p8(a, b); } -// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 @@ -13982,7 +13063,7 @@ void test_vst4_p16(poly16_t *a, poly16x4x4_t b) { vst4_p16(a, b); } -// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_u8_x2( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* @@ -13998,7 +13079,7 @@ uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) { return vld1q_u8_x2(a); } -// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_u16_x2( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* @@ -14016,7 +13097,7 @@ uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) { return vld1q_u16_x2(a); } -// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_u32_x2( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* @@ -14034,7 +13115,7 @@ uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) { return vld1q_u32_x2(a); } -// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_u64_x2( // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* @@ -14052,7 +13133,7 @@ uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) { return vld1q_u64_x2(a); } -// CHECK-LABEL: define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_s8_x2( // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* @@ -14068,7 +13149,7 @@ int8x16x2_t test_vld1q_s8_x2(int8_t const *a) { return vld1q_s8_x2(a); } -// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_s16_x2( // CHECK: [[RETVAL:%.*]] = alloca 
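Every hunk above follows one mechanical pattern: the CHECK-LABEL is relaxed from the full IR define line, with its coerced [N x <...>] parameters and #0 attribute group, to a bare @symbol( match. As a reference point for what the vst4/vst4q tests above exercise, a minimal usage sketch (hypothetical helper, not part of the diff; assumes an AArch64 target):

#include <arm_neon.h>
/* vst4q_s32 performs a 4-way interleaving store: it writes
   v.val[0][0], v.val[1][0], v.val[2][0], v.val[3][0], v.val[0][1], ...
   into 64 consecutive bytes at dst. */
void store_interleaved(int32_t *dst, int32x4x4_t v) {
  vst4q_s32(dst, v);
}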
@@ -14068,7 +13149,7 @@ int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
return vld1q_s8_x2(a);
}
-// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
@@ -14086,7 +13167,7 @@ int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
return vld1q_s16_x2(a);
}
-// CHECK-LABEL: define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s32_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
@@ -14104,7 +13185,7 @@ int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
return vld1q_s32_x2(a);
}
-// CHECK-LABEL: define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
@@ -14122,7 +13203,7 @@ int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
return vld1q_s64_x2(a);
}
-// CHECK-LABEL: define %struct.float16x8x2_t @test_vld1q_f16_x2(half* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
@@ -14140,7 +13221,7 @@ float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
return vld1q_f16_x2(a);
}
-// CHECK-LABEL: define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f32_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
@@ -14158,7 +13239,7 @@ float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
return vld1q_f32_x2(a);
}
-// CHECK-LABEL: define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
@@ -14176,7 +13257,7 @@ float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
return vld1q_f64_x2(a);
}
-// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld1q_p8_x2(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p8_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
@@ -14192,7 +13273,7 @@ poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
return vld1q_p8_x2(a);
}
-// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld1q_p16_x2(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
@@ -14210,7 +13291,7 @@ poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
return vld1q_p16_x2(a);
}
-// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld1q_p64_x2(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
@@ -14228,7 +13309,7 @@ poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
return vld1q_p64_x2(a);
}
-// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_u8_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
@@ -14244,7 +13325,7 @@ uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
return vld1_u8_x2(a);
}
-// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_u16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
@@ -14262,7 +13343,7 @@ uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
return vld1_u16_x2(a);
}
-// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_u32_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
@@ -14280,7 +13361,7 @@ uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
return vld1_u32_x2(a);
}
-// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_u64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
@@ -14298,7 +13379,7 @@ uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
return vld1_u64_x2(a);
}
-// CHECK-LABEL: define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_s8_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
@@ -14314,7 +13395,7 @@ int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
return vld1_s8_x2(a);
}
-// CHECK-LABEL: define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_s16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
@@ -14332,7 +13413,7 @@ int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
return vld1_s16_x2(a);
}
-// CHECK-LABEL: define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_s32_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
@@ -14350,7 +13431,7 @@ int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
return vld1_s32_x2(a);
}
-// CHECK-LABEL: define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_s64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
@@ -14368,7 +13449,7 @@ int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
return vld1_s64_x2(a);
}
-// CHECK-LABEL: define %struct.float16x4x2_t @test_vld1_f16_x2(half* %a) #0 {
+// CHECK-LABEL: @test_vld1_f16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
@@ -14386,7 +13467,7 @@ float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
return vld1_f16_x2(a);
}
-// CHECK-LABEL: define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) #0 {
+// CHECK-LABEL: @test_vld1_f32_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
@@ -14404,7 +13485,7 @@ float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
return vld1_f32_x2(a);
}
-// CHECK-LABEL: define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) #0 {
+// CHECK-LABEL: @test_vld1_f64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
@@ -14422,7 +13503,7 @@ float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
return vld1_f64_x2(a);
}
-// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld1_p8_x2(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_p8_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
@@ -14438,7 +13519,7 @@ poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
return vld1_p8_x2(a);
}
-// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld1_p16_x2(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_p16_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
@@ -14456,7 +13537,7 @@ poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
return vld1_p16_x2(a);
}
-// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld1_p64_x2(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_p64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
@@ -14474,7 +13555,7 @@ poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
return vld1_p64_x2(a);
}
-// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
@@ -14490,7 +13571,7 @@ uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
return vld1q_u8_x3(a);
}
-// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
@@ -14508,7 +13589,7 @@ uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
return vld1q_u16_x3(a);
}
-// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
@@ -14526,7 +13607,7 @@ uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
return vld1q_u32_x3(a);
}
-// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
@@ -14544,7 +13625,7 @@ uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
return vld1q_u64_x3(a);
}
-// CHECK-LABEL: define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
@@ -14560,7 +13641,7 @@ int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
return vld1q_s8_x3(a);
}
-// CHECK-LABEL: define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
@@ -14578,7 +13659,7 @@ int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
return vld1q_s16_x3(a);
}
-// CHECK-LABEL: define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
@@ -14596,7 +13677,7 @@ int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
return vld1q_s32_x3(a);
}
-// CHECK-LABEL: define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
@@ -14614,7 +13695,7 @@ int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
return vld1q_s64_x3(a);
}
-// CHECK-LABEL: define %struct.float16x8x3_t @test_vld1q_f16_x3(half* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
@@ -14632,7 +13713,7 @@ float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
return vld1q_f16_x3(a);
}
-// CHECK-LABEL: define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
@@ -14650,7 +13731,7 @@ float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
return vld1q_f32_x3(a);
}
-// CHECK-LABEL: define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
@@ -14668,7 +13749,7 @@ float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
return vld1q_f64_x3(a);
}
-// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld1q_p8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
@@ -14684,7 +13765,7 @@ poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
return vld1q_p8_x3(a);
}
-// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld1q_p16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
@@ -14702,7 +13783,7 @@ poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
return vld1q_p16_x3(a);
}
-// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld1q_p64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
@@ -14720,7 +13801,7 @@ poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
return vld1q_p64_x3(a);
}
-// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_u8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
@@ -14736,7 +13817,7 @@ uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
return vld1_u8_x3(a);
}
-// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_u16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
@@ -14754,7 +13835,7 @@ uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
return vld1_u16_x3(a);
}
-// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_u32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
@@ -14772,7 +13853,7 @@ uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
return vld1_u32_x3(a);
}
-// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_u64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
@@ -14790,7 +13871,7 @@ uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
return vld1_u64_x3(a);
}
-// CHECK-LABEL: define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_s8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
@@ -14806,7 +13887,7 @@ int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
return vld1_s8_x3(a);
}
-// CHECK-LABEL: define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_s16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
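The vld1*_x2/_x3/_x4 hunks cover the consolidated multi-register loads which, unlike vld2/vld3/vld4, do not de-interleave. A minimal sketch of the intrinsic family under test (hypothetical helper, assuming an AArch64 target):

#include <arm_neon.h>
/* vld1q_u8_x3 fills three uint8x16_t registers from 48 consecutive
   bytes at src; elements keep their memory order. */
uint8x16x3_t load_three(const uint8_t *src) {
  return vld1q_u8_x3(src);
}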
@@ -14824,7 +13905,7 @@ int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
return vld1_s16_x3(a);
}
-// CHECK-LABEL: define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_s32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
@@ -14842,7 +13923,7 @@ int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
return vld1_s32_x3(a);
}
-// CHECK-LABEL: define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_s64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
@@ -14860,7 +13941,7 @@ int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
return vld1_s64_x3(a);
}
-// CHECK-LABEL: define %struct.float16x4x3_t @test_vld1_f16_x3(half* %a) #0 {
+// CHECK-LABEL: @test_vld1_f16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
@@ -14878,7 +13959,7 @@ float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
return vld1_f16_x3(a);
}
-// CHECK-LABEL: define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) #0 {
+// CHECK-LABEL: @test_vld1_f32_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
@@ -14896,7 +13977,7 @@ float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
return vld1_f32_x3(a);
}
-// CHECK-LABEL: define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) #0 {
+// CHECK-LABEL: @test_vld1_f64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
@@ -14914,7 +13995,7 @@ float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
return vld1_f64_x3(a);
}
-// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld1_p8_x3(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_p8_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
@@ -14930,7 +14011,7 @@ poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
return vld1_p8_x3(a);
}
-// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld1_p16_x3(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_p16_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
@@ -14948,7 +14029,7 @@ poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
return vld1_p16_x3(a);
}
-// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld1_p64_x3(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_p64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
@@ -14966,7 +14047,7 @@ poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
return vld1_p64_x3(a);
}
-// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
@@ -14982,7 +14063,7 @@ uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
return vld1q_u8_x4(a);
}
-// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
@@ -15000,7 +14081,7 @@ uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
return vld1q_u16_x4(a);
}
-// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
@@ -15018,7 +14099,7 @@ uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
return vld1q_u32_x4(a);
}
-// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_u64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
@@ -15036,7 +14117,7 @@ uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
return vld1q_u64_x4(a);
}
-// CHECK-LABEL: define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
@@ -15052,7 +14133,7 @@ int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
return vld1q_s8_x4(a);
}
-// CHECK-LABEL: define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
@@ -15070,7 +14151,7 @@ int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
return vld1q_s16_x4(a);
}
-// CHECK-LABEL: define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
@@ -15088,7 +14169,7 @@ int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
return vld1q_s32_x4(a);
}
-// CHECK-LABEL: define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_s64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
@@ -15106,7 +14187,7 @@ int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
return vld1q_s64_x4(a);
}
-// CHECK-LABEL: define %struct.float16x8x4_t @test_vld1q_f16_x4(half* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
@@ -15124,7 +14205,7 @@ float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
return vld1q_f16_x4(a);
}
-// CHECK-LABEL: define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
@@ -15142,7 +14223,7 @@ float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
return vld1q_f32_x4(a);
}
-// CHECK-LABEL: define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) #0 {
+// CHECK-LABEL: @test_vld1q_f64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
@@ -15160,7 +14241,7 @@ float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
return vld1q_f64_x4(a);
}
-// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld1q_p8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
@@ -15176,7 +14257,7 @@ poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
return vld1q_p8_x4(a);
}
-// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld1q_p16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
@@ -15194,7 +14275,7 @@ poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
return vld1q_p16_x4(a);
}
-// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld1q_p64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1q_p64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
@@ -15212,7 +14293,7 @@ poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
return vld1q_p64_x4(a);
}
-// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_u8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
@@ -15228,7 +14309,7 @@ uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
return vld1_u8_x4(a);
}
-// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_u16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
@@ -15246,7 +14327,7 @@ uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
return vld1_u16_x4(a);
}
-// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_u32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
@@ -15264,7 +14345,7 @@ uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
return vld1_u32_x4(a);
}
-// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_u64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
@@ -15282,7 +14363,7 @@ uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
return vld1_u64_x4(a);
}
-// CHECK-LABEL: define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_s8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
@@ -15298,7 +14379,7 @@ int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
return vld1_s8_x4(a);
}
-// CHECK-LABEL: define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_s16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
@@ -15316,7 +14397,7 @@ int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
return vld1_s16_x4(a);
}
-// CHECK-LABEL: define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) #0 {
+// CHECK-LABEL: @test_vld1_s32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
@@ -15334,7 +14415,7 @@ int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
return vld1_s32_x4(a);
}
-// CHECK-LABEL: define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_s64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
@@ -15352,7 +14433,7 @@ int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
return vld1_s64_x4(a);
}
-// CHECK-LABEL: define %struct.float16x4x4_t @test_vld1_f16_x4(half* %a) #0 {
+// CHECK-LABEL: @test_vld1_f16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
@@ -15370,7 +14451,7 @@ float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
return vld1_f16_x4(a);
}
-// CHECK-LABEL: define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) #0 {
+// CHECK-LABEL: @test_vld1_f32_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
@@ -15388,7 +14469,7 @@ float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
return vld1_f32_x4(a);
}
-// CHECK-LABEL: define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) #0 {
+// CHECK-LABEL: @test_vld1_f64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
@@ -15406,7 +14487,7 @@ float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
return vld1_f64_x4(a);
}
-// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld1_p8_x4(i8* %a) #0 {
+// CHECK-LABEL: @test_vld1_p8_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
@@ -15422,7 +14503,7 @@ poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
return vld1_p8_x4(a);
}
-// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld1_p16_x4(i16* %a) #0 {
+// CHECK-LABEL: @test_vld1_p16_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
@@ -15440,7 +14521,7 @@ poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
return vld1_p16_x4(a);
}
-// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld1_p64_x4(i64* %a) #0 {
+// CHECK-LABEL: @test_vld1_p64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
@@ -15458,7 +14539,7 @@ poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
return vld1_p64_x4(a);
}
-// CHECK-LABEL: define void @test_vst1q_u8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_u8_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
@@ -15478,7 +14559,7 @@ void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
vst1q_u8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_u16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_u16_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
@@ -15504,7 +14585,7 @@ void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
vst1q_u16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_u32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_u32_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
@@ -15530,7 +14611,7 @@ void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
vst1q_u32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_u64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_u64_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
@@ -15556,7 +14637,7 @@ void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
vst1q_u64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_s8_x2(
// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
@@ -15576,7 +14657,7 @@ void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
vst1q_s8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
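From here the diff switches to the mirror-image vst1*_xN stores; the CHECK bodies keep verifying the same struct-coercion allocas and GEPs, and only the label line is relaxed. A sketch of the store side (hypothetical helper, assuming an AArch64 target):

#include <arm_neon.h>
/* vst1q_u8_x2 writes two uint8x16_t registers to 32 consecutive
   bytes at dst, again without interleaving. */
void store_two(uint8_t *dst, uint8x16x2_t v) {
  vst1q_u8_x2(dst, v);
}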
@@ -15602,7 +14683,7 @@ void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
vst1q_s16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
@@ -15628,7 +14709,7 @@ void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
vst1q_s32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_s64_x2(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
@@ -15654,7 +14735,7 @@ void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
vst1q_s64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_f16_x2(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_f16_x2(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
@@ -15680,7 +14761,7 @@ void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
vst1q_f16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_f32_x2(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
@@ -15706,7 +14787,7 @@ void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
vst1q_f32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_f64_x2(
// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
@@ -15732,7 +14813,7 @@ void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
vst1q_f64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_p8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
@@ -15752,7 +14833,7 @@ void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
vst1q_p8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_p16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
@@ -15778,7 +14859,7 @@ void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
vst1q_p16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_p64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_p64_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
@@ -15804,7 +14885,7 @@ void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
vst1q_p64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_u8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_u8_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
@@ -15824,7 +14905,7 @@ void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
vst1_u8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_u16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_u16_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
@@ -15850,7 +14931,7 @@ void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
vst1_u16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_u32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_u32_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
@@ -15876,7 +14957,7 @@ void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
vst1_u32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_u64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_u64_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
@@ -15902,7 +14983,7 @@ void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
vst1_u64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_s8_x2(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
@@ -15922,7 +15003,7 @@ void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
vst1_s8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
@@ -15948,7 +15029,7 @@ void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
vst1_s16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
@@ -15974,7 +15055,7 @@ void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
vst1_s32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_s64_x2(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
@@ -16000,7 +15081,7 @@ void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
vst1_s64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_f16_x2(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_f16_x2(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
@@ -16026,7 +15107,7 @@ void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
vst1_f16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_f32_x2(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
@@ -16052,7 +15133,7 @@ void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
vst1_f32_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_f64_x2(
// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
@@ -16078,7 +15159,7 @@ void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
vst1_f64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_p8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
@@ -16098,7 +15179,7 @@ void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
vst1_p8_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_p16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
@@ -16124,7 +15205,7 @@ void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
vst1_p16_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1_p64_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
@@ -16150,7 +15231,7 @@ void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
vst1_p64_x2(a, b);
}
-// CHECK-LABEL: define void @test_vst1q_u8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst1q_u8_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
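The payoff of the relaxed labels is robustness: matching only @name( keeps these generated tests stable whenever attributes or parameter coercion in the define line churn. A before/after for one FileCheck line, taken from a hunk above:

// Old, breaks whenever the coerced signature or attribute group changes:
// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
// New, pinned to the symbol alone:
// CHECK-LABEL: @test_vst1_p64_x2(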
[[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 @@ -16204,7 +15285,7 @@ void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) { vst1q_u16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_u32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u32_x3( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 @@ -16235,7 +15316,7 @@ void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) { vst1q_u32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_u64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u64_x3( // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 @@ -16266,7 +15347,7 @@ void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) { vst1q_u64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s8_x3( // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 @@ -16289,7 +15370,7 @@ void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) { vst1q_s8_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s16_x3( // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 @@ -16320,7 +15401,7 @@ void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) { vst1q_s16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s32_x3( // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 @@ -16351,7 +15432,7 @@ void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) { vst1q_s32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s64_x3( // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0 @@ -16382,7 +15463,7 @@ void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) { vst1q_s64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_f16_x3(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f16_x3( // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 @@ -16413,7 +15494,7 @@ void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) { vst1q_f16_x3(a, b); } -// CHECK-LABEL: define void 
@test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f32_x3( // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 @@ -16444,7 +15525,7 @@ void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) { vst1q_f32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f64_x3( // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0 @@ -16475,7 +15556,7 @@ void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) { vst1q_f64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_p8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p8_x3( // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 @@ -16498,7 +15579,7 @@ void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) { vst1q_p8_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_p16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p16_x3( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 @@ -16529,7 +15610,7 @@ void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) { vst1q_p16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_p64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p64_x3( // CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0 @@ -16560,7 +15641,7 @@ void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) { vst1q_p64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_u8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u8_x3( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 @@ -16583,7 +15664,7 @@ void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) { vst1_u8_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_u16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u16_x3( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 @@ -16614,7 +15695,7 @@ void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) { vst1_u16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_u32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u32_x3( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 @@ -16645,7 +15726,7 @@ void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) { vst1_u32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_u64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u64_x3( // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 @@ -16676,7 +15757,7 @@ void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) { vst1_u64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s8_x3( // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 @@ -16699,7 +15780,7 @@ void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) { vst1_s8_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s16_x3( // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 @@ -16730,7 +15811,7 @@ void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) { vst1_s16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s32_x3( // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 @@ -16761,7 +15842,7 @@ void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) { vst1_s32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s64_x3( // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0 @@ -16792,7 +15873,7 @@ void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) { vst1_s64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_f16_x3(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f16_x3( // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 @@ -16823,7 +15904,7 @@ void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) { vst1_f16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f32_x3( // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 @@ -16854,7 +15935,7 @@ void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) { vst1_f32_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f64_x3( // CHECK: [[B:%.*]] = alloca 
%struct.float64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0 @@ -16885,7 +15966,7 @@ void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) { vst1_f64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_p8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p8_x3( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 @@ -16908,7 +15989,7 @@ void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) { vst1_p8_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_p16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p16_x3( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 @@ -16939,7 +16020,7 @@ void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) { vst1_p16_x3(a, b); } -// CHECK-LABEL: define void @test_vst1_p64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p64_x3( // CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0 @@ -16970,7 +16051,7 @@ void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) { vst1_p64_x3(a, b); } -// CHECK-LABEL: define void @test_vst1q_u8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u8_x4( // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 @@ -16996,7 +16077,7 @@ void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) { vst1q_u8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_u16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u16_x4( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 @@ -17032,7 +16113,7 @@ void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) { vst1q_u16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_u32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u32_x4( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 @@ -17068,7 +16149,7 @@ void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) { vst1q_u32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_u64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_u64_x4( // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0 @@ -17104,7 +16185,7 @@ void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) { vst1q_u64_x4(a, b); } 
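The vst1*_x3/_x4 hunks here only shorten the CHECK-LABEL lines; the intrinsics under test are unchanged. As a reminder of what these tests exercise, a minimal sketch (assuming <arm_neon.h> on an AArch64 target; the helper name is hypothetical) of the store that vst1q_u8_x4 performs:

#include <arm_neon.h>

// Hedged sketch, not part of the test: vst1q_u8_x4(a, b) stores four
// consecutive 16-byte q-register vectors to a. The [4 x <16 x i8>] %b.coerce
// in the old CHECK-LABEL lines is the struct's .val array flattened for the
// calling convention.
static void vst1q_u8_x4_sketch(uint8_t *a, uint8x16x4_t b) {
  for (int i = 0; i != 4; ++i)
    vst1q_u8(a + 16 * i, b.val[i]); // one 16-byte store per vector
}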
-// CHECK-LABEL: define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s8_x4( // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 @@ -17130,7 +16211,7 @@ void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) { vst1q_s8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s16_x4( // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 @@ -17166,7 +16247,7 @@ void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) { vst1q_s16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s32_x4( // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 @@ -17202,7 +16283,7 @@ void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) { vst1q_s32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_s64_x4( // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 @@ -17238,7 +16319,7 @@ void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) { vst1q_s64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_f16_x4(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f16_x4( // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 @@ -17274,7 +16355,7 @@ void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) { vst1q_f16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f32_x4( // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 @@ -17310,7 +16391,7 @@ void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) { vst1q_f32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_f64_x4( // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 @@ -17346,7 +16427,7 @@ void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) { vst1q_f64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_p8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p8_x4( // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = 
getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 @@ -17372,7 +16453,7 @@ void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) { vst1q_p8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_p16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p16_x4( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 @@ -17408,7 +16489,7 @@ void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) { vst1q_p16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1q_p64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1q_p64_x4( // CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 @@ -17444,7 +16525,7 @@ void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) { vst1q_p64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_u8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u8_x4( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 @@ -17470,7 +16551,7 @@ void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) { vst1_u8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_u16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u16_x4( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 @@ -17506,7 +16587,7 @@ void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) { vst1_u16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_u32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u32_x4( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 @@ -17542,7 +16623,7 @@ void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) { vst1_u32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_u64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_u64_x4( // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 @@ -17578,7 +16659,7 @@ void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) { vst1_u64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s8_x4( // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 @@ -17604,7 +16685,7 @@ void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) { vst1_s8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s16_x4( // CHECK: [[B:%.*]] = alloca 
%struct.int16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 @@ -17640,7 +16721,7 @@ void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) { vst1_s16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s32_x4( // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 @@ -17676,7 +16757,7 @@ void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) { vst1_s32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_s64_x4( // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 @@ -17712,7 +16793,7 @@ void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) { vst1_s64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_f16_x4(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f16_x4( // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 @@ -17748,7 +16829,7 @@ void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) { vst1_f16_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f32_x4( // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 @@ -17784,7 +16865,7 @@ void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) { vst1_f32_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_f64_x4( // CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 @@ -17820,7 +16901,7 @@ void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) { vst1_f64_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_p8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p8_x4( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 @@ -17846,7 +16927,7 @@ void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) { vst1_p8_x4(a, b); } -// CHECK-LABEL: define void @test_vst1_p16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p16_x4( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 @@ -17882,7 +16963,7 @@ void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) { vst1_p16_x4(a, b); } -// 
CHECK-LABEL: define void @test_vst1_p64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK-LABEL: @test_vst1_p64_x4( // CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 @@ -17918,7 +16999,7 @@ void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) { vst1_p64_x4(a, b); } -// CHECK-LABEL: define i64 @test_vceqd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vceqd_s64( // CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] @@ -17926,7 +17007,7 @@ int64_t test_vceqd_s64(int64_t a, int64_t b) { return (int64_t)vceqd_s64(a, b); } -// CHECK-LABEL: define i64 @test_vceqd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vceqd_u64( // CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] @@ -17934,7 +17015,7 @@ uint64_t test_vceqd_u64(uint64_t a, uint64_t b) { return (int64_t)vceqd_u64(a, b); } -// CHECK-LABEL: define i64 @test_vceqzd_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vceqzd_s64( // CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 // CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQZ_I]] @@ -17942,7 +17023,7 @@ int64_t test_vceqzd_s64(int64_t a) { return (int64_t)vceqzd_s64(a); } -// CHECK-LABEL: define i64 @test_vceqzd_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vceqzd_u64( // CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 // CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQZD_I]] @@ -17950,7 +17031,7 @@ int64_t test_vceqzd_u64(int64_t a) { return (int64_t)vceqzd_u64(a); } -// CHECK-LABEL: define i64 @test_vcged_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vcged_s64( // CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] @@ -17958,15 +17039,15 @@ int64_t test_vcged_s64(int64_t a, int64_t b) { return (int64_t)vcged_s64(a, b); } -// CHECK-LABEL: define i64 @test_vcged_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vcged_u64( // CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcged_u64(uint64_t a, uint64_t b) { - return (uint64_t)vcged_u64(a, b); + return (uint64_t)vcged_u64(a, b); } -// CHECK-LABEL: define i64 @test_vcgezd_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vcgezd_s64( // CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0 // CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCGEZ_I]] @@ -17974,7 +17055,7 @@ int64_t test_vcgezd_s64(int64_t a) { return (int64_t)vcgezd_s64(a); } -// CHECK-LABEL: define i64 @test_vcgtd_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vcgtd_s64( // CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] @@ -17982,7 +17063,7 @@ int64_t test_vcgtd_s64(int64_t a, int64_t b) { return (int64_t)vcgtd_s64(a, b); } -// CHECK-LABEL: define i64 @test_vcgtd_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vcgtd_u64( // CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b // CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCEQD_I]] @@ -17990,7 +17071,7 @@ uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) { return (uint64_t)vcgtd_u64(a, b); } -// CHECK-LABEL: define i64 @test_vcgtzd_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vcgtzd_s64( // CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0 // 
CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCGTZ_I]]
@@ -17998,7 +17079,7 @@ int64_t test_vcgtzd_s64(int64_t a) {
return (int64_t)vcgtzd_s64(a);
}
-// CHECK-LABEL: define i64 @test_vcled_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vcled_s64(
// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b
// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCEQD_I]]
@@ -18006,7 +17087,7 @@ int64_t test_vcled_s64(int64_t a, int64_t b) {
return (int64_t)vcled_s64(a, b);
}
-// CHECK-LABEL: define i64 @test_vcled_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vcled_u64(
// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b
// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCEQD_I]]
@@ -18014,7 +17095,7 @@ uint64_t test_vcled_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcled_u64(a, b);
}
-// CHECK-LABEL: define i64 @test_vclezd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vclezd_s64(
// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0
// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCLEZ_I]]
@@ -18022,7 +17103,7 @@ int64_t test_vclezd_s64(int64_t a) {
return (int64_t)vclezd_s64(a);
}
-// CHECK-LABEL: define i64 @test_vcltd_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vcltd_s64(
// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b
// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCEQD_I]]
@@ -18030,7 +17111,7 @@ int64_t test_vcltd_s64(int64_t a, int64_t b) {
return (int64_t)vcltd_s64(a, b);
}
-// CHECK-LABEL: define i64 @test_vcltd_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vcltd_u64(
// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b
// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCEQD_I]]
@@ -18038,7 +17119,7 @@ uint64_t test_vcltd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcltd_u64(a, b);
}
-// CHECK-LABEL: define i64 @test_vcltzd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vcltzd_s64(
// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0
// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCLTZ_I]]
@@ -18046,7 +17127,7 @@ int64_t test_vcltzd_s64(int64_t a) {
return (int64_t)vcltzd_s64(a);
}
-// CHECK-LABEL: define i64 @test_vtstd_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vtstd_s64(
// CHECK: [[TMP0:%.*]] = and i64 %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
@@ -18055,7 +17136,7 @@ int64_t test_vtstd_s64(int64_t a, int64_t b) {
return (int64_t)vtstd_s64(a, b);
}
-// CHECK-LABEL: define i64 @test_vtstd_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vtstd_u64(
// CHECK: [[TMP0:%.*]] = and i64 %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
@@ -18064,14 +17145,14 @@ uint64_t test_vtstd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vtstd_u64(a, b);
}
-// CHECK-LABEL: define i64 @test_vabsd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vabsd_s64(
// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4
// CHECK: ret i64 [[VABSD_S64_I]]
int64_t test_vabsd_s64(int64_t a) {
return (int64_t)vabsd_s64(a);
}
-// CHECK-LABEL: define i8 @test_vqabsb_s8(i8 %a) #0 {
+// CHECK-LABEL: @test_vqabsb_s8(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
@@ -18080,7 +17161,7 @@ int8_t test_vqabsb_s8(int8_t a) {
return (int8_t)vqabsb_s8(a);
}
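A note on the pattern these scalar-compare hunks keep matching: the intrinsics return a mask, not a 0/1 bool, so the IR is always an icmp followed by sext i1. A minimal C sketch of the same contract (helper name hypothetical):

#include <stdint.h>

// Hedged sketch of the icmp+sext contract checked above: scalar NEON
// compares yield all-ones on true and all-zero on false.
static uint64_t vceqd_sketch(int64_t a, int64_t b) {
  return (a == b) ? ~UINT64_C(0) : UINT64_C(0); // sext i1 to i64
}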
-// CHECK-LABEL: define i16 @test_vqabsh_s16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqabsh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
@@ -18089,28 +17170,28 @@ int16_t test_vqabsh_s16(int16_t a) {
return (int16_t)vqabsh_s16(a);
}
-// CHECK-LABEL: define i32 @test_vqabss_s32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqabss_s32(
// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4
// CHECK: ret i32 [[VQABSS_S32_I]]
int32_t test_vqabss_s32(int32_t a) {
return (int32_t)vqabss_s32(a);
}
-// CHECK-LABEL: define i64 @test_vqabsd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqabsd_s64(
// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4
// CHECK: ret i64 [[VQABSD_S64_I]]
int64_t test_vqabsd_s64(int64_t a) {
return (int64_t)vqabsd_s64(a);
}
-// CHECK-LABEL: define i64 @test_vnegd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vnegd_s64(
// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a
// CHECK: ret i64 [[VNEGD_I]]
int64_t test_vnegd_s64(int64_t a) {
return (int64_t)vnegd_s64(a);
}
-// CHECK-LABEL: define i8 @test_vqnegb_s8(i8 %a) #0 {
+// CHECK-LABEL: @test_vqnegb_s8(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
@@ -18119,7 +17200,7 @@ int8_t test_vqnegb_s8(int8_t a) {
return (int8_t)vqnegb_s8(a);
}
-// CHECK-LABEL: define i16 @test_vqnegh_s16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqnegh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
@@ -18128,21 +17209,21 @@ int16_t test_vqnegh_s16(int16_t a) {
return (int16_t)vqnegh_s16(a);
}
-// CHECK-LABEL: define i32 @test_vqnegs_s32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqnegs_s32(
// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4
// CHECK: ret i32 [[VQNEGS_S32_I]]
int32_t test_vqnegs_s32(int32_t a) {
return (int32_t)vqnegs_s32(a);
}
-// CHECK-LABEL: define i64 @test_vqnegd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqnegd_s64(
// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4
// CHECK: ret i64 [[VQNEGD_S64_I]]
int64_t test_vqnegd_s64(int64_t a) {
return (int64_t)vqnegd_s64(a);
}
-// CHECK-LABEL: define i8 @test_vuqaddb_s8(i8 %a, i8 %b) #0 {
+// CHECK-LABEL: @test_vuqaddb_s8(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
@@ -18152,7 +17233,7 @@ int8_t test_vuqaddb_s8(int8_t a, int8_t b) {
return (int8_t)vuqaddb_s8(a, b);
}
-// CHECK-LABEL: define i16 @test_vuqaddh_s16(i16 %a, i16 %b) #0 {
+// CHECK-LABEL: @test_vuqaddh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
@@ -18162,21 +17243,21 @@ int16_t test_vuqaddh_s16(int16_t a, int16_t b) {
return (int16_t)vuqaddh_s16(a, b);
}
-// CHECK-LABEL: define i32 @test_vuqadds_s32(i32 %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vuqadds_s32(
// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4
// CHECK: ret i32 [[VUQADDS_S32_I]]
int32_t test_vuqadds_s32(int32_t a, int32_t b) {
return (int32_t)vuqadds_s32(a, b);
}
-// CHECK-LABEL: define i64 @test_vuqaddd_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vuqaddd_s64(
// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4
// CHECK: ret i64 [[VUQADDD_S64_I]]
int64_t test_vuqaddd_s64(int64_t a, int64_t b) {
return (int64_t)vuqaddd_s64(a, b);
}
-// CHECK-LABEL: define i8 @test_vsqaddb_u8(i8 %a, i8 %b) #0 {
+// CHECK-LABEL: @test_vsqaddb_u8(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
@@ -18186,7 +17267,7 @@ uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) {
return (uint8_t)vsqaddb_u8(a, b);
}
-// CHECK-LABEL: define i16 @test_vsqaddh_u16(i16 %a, i16 %b) #0 {
+// CHECK-LABEL: @test_vsqaddh_u16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
@@ -18196,21 +17277,21 @@ uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) {
return (uint16_t)vsqaddh_u16(a, b);
}
-// CHECK-LABEL: define i32 @test_vsqadds_u32(i32 %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vsqadds_u32(
// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4
// CHECK: ret i32 [[VSQADDS_U32_I]]
uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) {
return (uint32_t)vsqadds_u32(a, b);
}
-// CHECK-LABEL: define i64 @test_vsqaddd_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vsqaddd_u64(
// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4
// CHECK: ret i64 [[VSQADDD_U64_I]]
uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vsqaddd_u64(a, b);
}
-// CHECK-LABEL: define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) #0 {
+// CHECK-LABEL: @test_vqdmlalh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
@@ -18218,11 +17299,10 @@ uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) {
// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4
// CHECK: ret i32 [[VQDMLXL1_I]]
int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
-
return (int32_t)vqdmlalh_s16(a, b, c);
}
-// CHECK-LABEL: define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) #0 {
+// CHECK-LABEL: @test_vqdmlals_s32(
// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
// CHECK: ret i64 [[VQDMLXL1_I]]
@@ -18230,7 +17310,7 @@ int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
return (int64_t)vqdmlals_s32(a, b, c);
}
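Besides shortening the labels, the vqdmlalh/vqdmlslh hunks drop a stray blank line inside the test bodies. For the arithmetic itself, a hedged sketch of the sqdmull+sqadd pair the CHECK lines pin down (helper names are mine):

#include <stdint.h>

// Saturate a 64-bit intermediate into int32_t.
static int32_t sat_s32(int64_t x) {
  if (x > INT32_MAX) return INT32_MAX;
  if (x < INT32_MIN) return INT32_MIN;
  return (int32_t)x;
}

// Hedged sketch of vqdmlalh_s16: widen, double with saturation (sqdmull),
// then saturating-add into the accumulator (sqadd).
static int32_t vqdmlalh_sketch(int32_t a, int16_t b, int16_t c) {
  int32_t prod = sat_s32(2 * (int64_t)b * (int64_t)c);
  return sat_s32((int64_t)a + prod);
}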
-// CHECK-LABEL: define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) #0 {
+// CHECK-LABEL: @test_vqdmlslh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
@@ -18238,11 +17318,10 @@ int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4
// CHECK: ret i32 [[VQDMLXL1_I]]
int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
-
return (int32_t)vqdmlslh_s16(a, b, c);
}
-// CHECK-LABEL: define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) #0 {
+// CHECK-LABEL: @test_vqdmlsls_s32(
// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
// CHECK: ret i64 [[VQDMLXL1_I]]
@@ -18250,7 +17329,7 @@ int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
return (int64_t)vqdmlsls_s32(a, b, c);
}
-// CHECK-LABEL: define i32 @test_vqdmullh_s16(i16 %a, i16 %b) #0 {
+// CHECK-LABEL: @test_vqdmullh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
@@ -18260,14 +17339,14 @@ int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
return (int32_t)vqdmullh_s16(a, b);
}
-// CHECK-LABEL: define i64 @test_vqdmulls_s32(i32 %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqdmulls_s32(
// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4
// CHECK: ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
return (int64_t)vqdmulls_s32(a, b);
}
-// CHECK-LABEL: define i8 @test_vqmovunh_s16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqmovunh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
@@ -18276,7 +17355,7 @@ int8_t test_vqmovunh_s16(int16_t a) {
return (int8_t)vqmovunh_s16(a);
}
-// CHECK-LABEL: define i16 @test_vqmovuns_s32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqmovuns_s32(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
@@ -18285,14 +17364,14 @@ int16_t test_vqmovuns_s32(int32_t a) {
return (int16_t)vqmovuns_s32(a);
}
-// CHECK-LABEL: define i32 @test_vqmovund_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqmovund_s64(
// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4
// CHECK: ret i32 [[VQMOVUND_S64_I]]
int32_t test_vqmovund_s64(int64_t a) {
return (int32_t)vqmovund_s64(a);
}
-// CHECK-LABEL: define i8 @test_vqmovnh_s16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqmovnh_s16(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
@@ -18301,7 +17380,7 @@ int8_t test_vqmovnh_s16(int16_t a) {
return (int8_t)vqmovnh_s16(a);
}
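A hedged sketch of the saturating narrow (sqxtn) matched above; the IR round-trips the scalar through lane 0 of a vector because the intrinsic only exists in vector form at this width (helper name hypothetical):

#include <stdint.h>

// vqmovnh_s16 narrows i16 to i8, clamping instead of truncating.
static int8_t vqmovnh_sketch(int16_t a) {
  if (a > INT8_MAX) return INT8_MAX;
  if (a < INT8_MIN) return INT8_MIN;
  return (int8_t)a;
}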
-// CHECK-LABEL: define i16 @test_vqmovns_s32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqmovns_s32(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
@@ -18310,14 +17389,14 @@ int16_t test_vqmovns_s32(int32_t a) {
return (int16_t)vqmovns_s32(a);
}
-// CHECK-LABEL: define i32 @test_vqmovnd_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqmovnd_s64(
// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4
// CHECK: ret i32 [[VQMOVND_S64_I]]
int32_t test_vqmovnd_s64(int64_t a) {
return (int32_t)vqmovnd_s64(a);
}
-// CHECK-LABEL: define i8 @test_vqmovnh_u16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqmovnh_u16(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
@@ -18326,7 +17405,7 @@ int8_t test_vqmovnh_u16(int16_t a) {
return (int8_t)vqmovnh_u16(a);
}
-// CHECK-LABEL: define i16 @test_vqmovns_u32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqmovns_u32(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
@@ -18335,14 +17414,14 @@ int16_t test_vqmovns_u32(int32_t a) {
return (int16_t)vqmovns_u32(a);
}
-// CHECK-LABEL: define i32 @test_vqmovnd_u64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqmovnd_u64(
// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4
// CHECK: ret i32 [[VQMOVND_U64_I]]
int32_t test_vqmovnd_u64(int64_t a) {
return (int32_t)vqmovnd_u64(a);
}
-// CHECK-LABEL: define i32 @test_vceqs_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vceqs_f32(
// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b
// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
// CHECK: ret i32 [[VCMPD_I]]
@@ -18350,7 +17429,7 @@ uint32_t test_vceqs_f32(float32_t a, float32_t b) {
return (uint32_t)vceqs_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vceqd_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vceqd_f64(
// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b
// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCMPD_I]]
@@ -18358,7 +17437,7 @@ uint64_t test_vceqd_f64(float64_t a, float64_t b) {
return (uint64_t)vceqd_f64(a, b);
}
-// CHECK-LABEL: define i32 @test_vceqzs_f32(float %a) #0 {
+// CHECK-LABEL: @test_vceqzs_f32(
// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
// CHECK: ret i32 [[VCEQZ_I]]
@@ -18366,7 +17445,7 @@ uint32_t test_vceqzs_f32(float32_t a) {
return (uint32_t)vceqzs_f32(a);
}
-// CHECK-LABEL: define i64 @test_vceqzd_f64(double %a) #0 {
+// CHECK-LABEL: @test_vceqzd_f64(
// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCEQZ_I]]
@@ -18374,7 +17453,7 @@ uint64_t test_vceqzd_f64(float64_t a) {
return (uint64_t)vceqzd_f64(a);
}
-// CHECK-LABEL: define i32 @test_vcges_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vcges_f32(
// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b
// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
// CHECK: ret i32 [[VCMPD_I]]
@@ -18382,7 +17461,7 @@ uint32_t test_vcges_f32(float32_t a, float32_t b) {
return (uint32_t)vcges_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcged_f64(double %a, double %b) #0 {
+// CHECK-LABEL: 
@test_vcged_f64( // CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b // CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCMPD_I]] @@ -18390,7 +17469,7 @@ uint64_t test_vcged_f64(float64_t a, float64_t b) { return (uint64_t)vcged_f64(a, b); } -// CHECK-LABEL: define i32 @test_vcgezs_f32(float %a) #0 { +// CHECK-LABEL: @test_vcgezs_f32( // CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 // CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 // CHECK: ret i32 [[VCGEZ_I]] @@ -18398,7 +17477,7 @@ uint32_t test_vcgezs_f32(float32_t a) { return (uint32_t)vcgezs_f32(a); } -// CHECK-LABEL: define i64 @test_vcgezd_f64(double %a) #0 { +// CHECK-LABEL: @test_vcgezd_f64( // CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 // CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCGEZ_I]] @@ -18406,7 +17485,7 @@ uint64_t test_vcgezd_f64(float64_t a) { return (uint64_t)vcgezd_f64(a); } -// CHECK-LABEL: define i32 @test_vcgts_f32(float %a, float %b) #0 { +// CHECK-LABEL: @test_vcgts_f32( // CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b // CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 // CHECK: ret i32 [[VCMPD_I]] @@ -18414,7 +17493,7 @@ uint32_t test_vcgts_f32(float32_t a, float32_t b) { return (uint32_t)vcgts_f32(a, b); } -// CHECK-LABEL: define i64 @test_vcgtd_f64(double %a, double %b) #0 { +// CHECK-LABEL: @test_vcgtd_f64( // CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b // CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCMPD_I]] @@ -18422,7 +17501,7 @@ uint64_t test_vcgtd_f64(float64_t a, float64_t b) { return (uint64_t)vcgtd_f64(a, b); } -// CHECK-LABEL: define i32 @test_vcgtzs_f32(float %a) #0 { +// CHECK-LABEL: @test_vcgtzs_f32( // CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 // CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 // CHECK: ret i32 [[VCGTZ_I]] @@ -18430,7 +17509,7 @@ uint32_t test_vcgtzs_f32(float32_t a) { return (uint32_t)vcgtzs_f32(a); } -// CHECK-LABEL: define i64 @test_vcgtzd_f64(double %a) #0 { +// CHECK-LABEL: @test_vcgtzd_f64( // CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 // CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCGTZ_I]] @@ -18438,7 +17517,7 @@ uint64_t test_vcgtzd_f64(float64_t a) { return (uint64_t)vcgtzd_f64(a); } -// CHECK-LABEL: define i32 @test_vcles_f32(float %a, float %b) #0 { +// CHECK-LABEL: @test_vcles_f32( // CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b // CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 // CHECK: ret i32 [[VCMPD_I]] @@ -18446,7 +17525,7 @@ uint32_t test_vcles_f32(float32_t a, float32_t b) { return (uint32_t)vcles_f32(a, b); } -// CHECK-LABEL: define i64 @test_vcled_f64(double %a, double %b) #0 { +// CHECK-LABEL: @test_vcled_f64( // CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b // CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCMPD_I]] @@ -18454,7 +17533,7 @@ uint64_t test_vcled_f64(float64_t a, float64_t b) { return (uint64_t)vcled_f64(a, b); } -// CHECK-LABEL: define i32 @test_vclezs_f32(float %a) #0 { +// CHECK-LABEL: @test_vclezs_f32( // CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00 // CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 // CHECK: ret i32 [[VCLEZ_I]] @@ -18462,7 +17541,7 @@ uint32_t test_vclezs_f32(float32_t a) { return (uint32_t)vclezs_f32(a); } -// CHECK-LABEL: define i64 @test_vclezd_f64(double %a) #0 { +// CHECK-LABEL: @test_vclezd_f64( // CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00 // CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 // CHECK: ret i64 [[VCLEZ_I]] @@ -18470,7 
+17549,7 @@ uint64_t test_vclezd_f64(float64_t a) {
return (uint64_t)vclezd_f64(a);
}
-// CHECK-LABEL: define i32 @test_vclts_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vclts_f32(
// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b
// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
// CHECK: ret i32 [[VCMPD_I]]
@@ -18478,7 +17557,7 @@ uint32_t test_vclts_f32(float32_t a, float32_t b) {
return (uint32_t)vclts_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcltd_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vcltd_f64(
// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b
// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCMPD_I]]
@@ -18486,7 +17565,7 @@ uint64_t test_vcltd_f64(float64_t a, float64_t b) {
return (uint64_t)vcltd_f64(a, b);
}
-// CHECK-LABEL: define i32 @test_vcltzs_f32(float %a) #0 {
+// CHECK-LABEL: @test_vcltzs_f32(
// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
// CHECK: ret i32 [[VCLTZ_I]]
@@ -18494,7 +17573,7 @@ uint32_t test_vcltzs_f32(float32_t a) {
return (uint32_t)vcltzs_f32(a);
}
-// CHECK-LABEL: define i64 @test_vcltzd_f64(double %a) #0 {
+// CHECK-LABEL: @test_vcltzd_f64(
// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK: ret i64 [[VCLTZ_I]]
@@ -18502,70 +17581,70 @@ uint64_t test_vcltzd_f64(float64_t a) {
return (uint64_t)vcltzd_f64(a);
}
-// CHECK-LABEL: define i32 @test_vcages_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vcages_f32(
// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4
// CHECK: ret i32 [[VCAGES_F32_I]]
uint32_t test_vcages_f32(float32_t a, float32_t b) {
return (uint32_t)vcages_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcaged_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vcaged_f64(
// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4
// CHECK: ret i64 [[VCAGED_F64_I]]
uint64_t test_vcaged_f64(float64_t a, float64_t b) {
return (uint64_t)vcaged_f64(a, b);
}
-// CHECK-LABEL: define i32 @test_vcagts_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vcagts_f32(
// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4
// CHECK: ret i32 [[VCAGTS_F32_I]]
uint32_t test_vcagts_f32(float32_t a, float32_t b) {
return (uint32_t)vcagts_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcagtd_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vcagtd_f64(
// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4
// CHECK: ret i64 [[VCAGTD_F64_I]]
uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
return (uint64_t)vcagtd_f64(a, b);
}
-// CHECK-LABEL: define i32 @test_vcales_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vcales_f32(
// CHECK: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
// CHECK: ret i32 [[VCALES_F32_I]]
uint32_t test_vcales_f32(float32_t a, float32_t b) {
return (uint32_t)vcales_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcaled_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vcaled_f64(
// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
// CHECK: ret i64 [[VCALED_F64_I]]
uint64_t test_vcaled_f64(float64_t a, float64_t b) {
return (uint64_t)vcaled_f64(a, b);
}
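Worth noting in the vcale/vcaled hunks above: there is no dedicated absolute-compare-less-than instruction, so Clang emits facge/facgt with the operands swapped, which is exactly what these CHECK lines pin down. A hedged sketch (helper name hypothetical):

#include <math.h>
#include <stdint.h>

// vcales_f32(a, b) is facge(b, a): |b| >= |a|, as an all-ones/all-zero mask.
static uint32_t vcales_sketch(float a, float b) {
  return (fabsf(b) >= fabsf(a)) ? UINT32_MAX : 0;
}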
-// CHECK-LABEL: define i32 @test_vcalts_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vcalts_f32(
// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
// CHECK: ret i32 [[VCALTS_F32_I]]
uint32_t test_vcalts_f32(float32_t a, float32_t b) {
return (uint32_t)vcalts_f32(a, b);
}
-// CHECK-LABEL: define i64 @test_vcaltd_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vcaltd_f64(
// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
// CHECK: ret i64 [[VCALTD_F64_I]]
uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
return (uint64_t)vcaltd_f64(a, b);
}
-// CHECK-LABEL: define i64 @test_vshrd_n_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vshrd_n_s64(
// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1
// CHECK: ret i64 [[SHRD_N]]
int64_t test_vshrd_n_s64(int64_t a) {
return (int64_t)vshrd_n_s64(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
@@ -18574,22 +17653,20 @@ int64x1_t test_vshr_n_s64(int64x1_t a) {
return vshr_n_s64(a, 1);
}
-// CHECK-LABEL: define i64 @test_vshrd_n_u64(i64 %a) #0 {
+// CHECK-LABEL: @test_vshrd_n_u64(
// CHECK: ret i64 0
uint64_t test_vshrd_n_u64(uint64_t a) {
-
return (uint64_t)vshrd_n_u64(a, 64);
}
-// CHECK-LABEL: define i64 @test_vshrd_n_u64_2() #0 {
+// CHECK-LABEL: @test_vshrd_n_u64_2(
// CHECK: ret i64 0
uint64_t test_vshrd_n_u64_2() {
-
uint64_t a = UINT64_C(0xf000000000000000);
return vshrd_n_u64(a, 64);
}
-// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshr_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
@@ -18598,14 +17675,14 @@ uint64x1_t test_vshr_n_u64(uint64x1_t a) {
return vshr_n_u64(a, 1);
}
-// CHECK-LABEL: define i64 @test_vrshrd_n_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vrshrd_n_s64(
// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
// CHECK: ret i64 [[VRSHR_N]]
int64_t test_vrshrd_n_s64(int64_t a) {
return (int64_t)vrshrd_n_s64(a, 63);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
@@ -18614,14 +17691,14 @@ int64x1_t test_vrshr_n_s64(int64x1_t a) {
return vrshr_n_s64(a, 1);
}
-// CHECK-LABEL: define i64 @test_vrshrd_n_u64(i64 %a) #0 {
+// CHECK-LABEL: @test_vrshrd_n_u64(
// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
// CHECK: ret i64 [[VRSHR_N]]
uint64_t test_vrshrd_n_u64(uint64_t a) {
return (uint64_t)vrshrd_n_u64(a, 63);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
@@ -18630,7 +17707,7 @@ uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
return vrshr_n_u64(a, 1);
}
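Two details in the shift hunks above: an unsigned right shift by the full 64 bits is constant-folded to ret i64 0, and the rounding shifts are modeled as srshl/urshl by a negative amount. A hedged sketch of the rounding behavior (helper name hypothetical; the hardware keeps an extra carry bit that the plain C addition below can lose):

#include <stdint.h>

// vrshrd_n_u64(a, n) for 1 <= n <= 63: add the rounding bit, then shift.
static uint64_t vrshrd_n_u64_sketch(uint64_t a, unsigned n) {
  return (a + (UINT64_C(1) << (n - 1))) >> n;
}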
-// CHECK-LABEL: define i64 @test_vsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vsrad_n_s64(
// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63
// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
// CHECK: ret i64 [[TMP0]]
@@ -18638,7 +17715,7 @@ int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
return (int64_t)vsrad_n_s64(a, b, 63);
}
-// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -18650,7 +17727,7 @@ int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
return vsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: define i64 @test_vsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vsrad_n_u64(
// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63
// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
// CHECK: ret i64 [[TMP0]]
@@ -18658,14 +17735,13 @@ uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vsrad_n_u64(a, b, 63);
}
-// CHECK-LABEL: define i64 @test_vsrad_n_u64_2(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vsrad_n_u64_2(
// CHECK: ret i64 %a
uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
-
return (uint64_t)vsrad_n_u64(a, b, 64);
}
-// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -18677,7 +17753,7 @@ uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vrsrad_n_s64(
// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
// CHECK: ret i64 [[TMP1]]
@@ -18685,7 +17761,7 @@ int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
return (int64_t)vrsrad_n_s64(a, b, 63);
}
-// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18697,7 +17773,7 @@ int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
return vrsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK-LABEL: @test_vrsrad_n_u64(
// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
// CHECK: ret i64 [[TMP1]]
@@ -18705,7 +17781,7 @@ uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vrsrad_n_u64(a, b, 63);
}
-// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrsra_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18717,13 +17793,14 @@ uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vrsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: define i64 @test_vshld_n_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vshld_n_s64(
// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1
// CHECK: ret i64 [[SHLD_N]]
int64_t test_vshld_n_s64(int64_t a) {
return (int64_t)vshld_n_s64(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
+
+// CHECK-LABEL: @test_vshl_n_s64(
// CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> @@ -18732,14 +17809,14 @@ int64x1_t test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); } -// CHECK-LABEL: define i64 @test_vshld_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vshld_n_u64( // CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63 // CHECK: ret i64 [[SHLD_N]] uint64_t test_vshld_n_u64(uint64_t a) { return (uint64_t)vshld_n_u64(a, 63); } -// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vshl_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> @@ -18748,7 +17825,7 @@ uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); } -// CHECK-LABEL: define i8 @test_vqshlb_n_s8(i8 %a) #0 { +// CHECK-LABEL: @test_vqshlb_n_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0 @@ -18757,7 +17834,7 @@ int8_t test_vqshlb_n_s8(int8_t a) { return (int8_t)vqshlb_n_s8(a, 7); } -// CHECK-LABEL: define i16 @test_vqshlh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqshlh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0 @@ -18766,35 +17843,35 @@ int16_t test_vqshlh_n_s16(int16_t a) { return (int16_t)vqshlh_n_s16(a, 15); } -// CHECK-LABEL: define i32 @test_vqshls_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqshls_n_s32( // CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31) // CHECK: ret i32 [[VQSHLS_N_S32]] int32_t test_vqshls_n_s32(int32_t a) { return (int32_t)vqshls_n_s32(a, 31); } -// CHECK-LABEL: define i64 @test_vqshld_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqshld_n_s64( // CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63) // CHECK: ret i64 [[VQSHL_N]] int64_t test_vqshld_n_s64(int64_t a) { return (int64_t)vqshld_n_s64(a, 63); } -// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_s8( // CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) // CHECK: ret <8 x i8> [[VQSHL_N]] int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 0); } -// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_s8( // CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) // CHECK: ret <16 x i8> [[VQSHL_N]] int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 0); } -// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) @@ -18803,7 +17880,7 @@ int16x4_t test_vqshl_n_s16(int16x4_t a) { return 
vqshl_n_s16(a, 0); } -// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) @@ -18812,7 +17889,7 @@ int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 0); } -// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer) @@ -18821,7 +17898,7 @@ int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 0); } -// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer) @@ -18830,7 +17907,7 @@ int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 0); } -// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer) @@ -18839,21 +17916,21 @@ int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 0); } -// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_u8( // CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) // CHECK: ret <8 x i8> [[VQSHL_N]] uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 0); } -// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_u8( // CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) // CHECK: ret <16 x i8> [[VQSHL_N]] uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 0); } -// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) @@ -18862,7 +17939,7 @@ uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 0); } -// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqshlq_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) @@ -18871,7 +17948,7 @@ uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 0); } -// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vqshl_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> 
%a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
@@ -18880,7 +17957,7 @@ uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
return vqshl_n_u32(a, 0);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
@@ -18889,7 +17966,7 @@ uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
return vqshlq_n_u32(a, 0);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
@@ -18898,7 +17975,7 @@ uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
return vqshlq_n_u64(a, 0);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
@@ -18907,7 +17984,7 @@ int64x1_t test_vqshl_n_s64(int64x1_t a) {
return vqshl_n_s64(a, 1);
}
-// CHECK-LABEL: define i8 @test_vqshlb_n_u8(i8 %a) #0 {
+// CHECK-LABEL: @test_vqshlb_n_u8(
// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
@@ -18916,7 +17993,7 @@ uint8_t test_vqshlb_n_u8(uint8_t a) {
return (uint8_t)vqshlb_n_u8(a, 7);
}
-// CHECK-LABEL: define i16 @test_vqshlh_n_u16(i16 %a) #0 {
+// CHECK-LABEL: @test_vqshlh_n_u16(
// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
@@ -18925,21 +18002,21 @@ uint16_t test_vqshlh_n_u16(uint16_t a) {
return (uint16_t)vqshlh_n_u16(a, 15);
}
-// CHECK-LABEL: define i32 @test_vqshls_n_u32(i32 %a) #0 {
+// CHECK-LABEL: @test_vqshls_n_u32(
// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
// CHECK: ret i32 [[VQSHLS_N_U32]]
uint32_t test_vqshls_n_u32(uint32_t a) {
return (uint32_t)vqshls_n_u32(a, 31);
}
-// CHECK-LABEL: define i64 @test_vqshld_n_u64(i64 %a) #0 {
+// CHECK-LABEL: @test_vqshld_n_u64(
// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
// CHECK: ret i64 [[VQSHL_N]]
uint64_t test_vqshld_n_u64(uint64_t a) {
return (uint64_t)vqshld_n_u64(a, 63);
}
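The byte- and halfword-wide vqshl tests above splat the scalar into lane 0 of an undef vector because sqshl/uqshl only exist in vector form at those widths; the doubleword forms stay scalar. A hedged sketch of the unsigned saturating shift itself (helper name hypothetical):

#include <stdint.h>

// vqshld_n_u64(a, n) for 0 <= n <= 63: saturate to all-ones as soon as the
// shift would drop a set bit, instead of wrapping.
static uint64_t vqshld_n_sketch(uint64_t a, unsigned n) {
  if (a > (UINT64_MAX >> n))
    return UINT64_MAX;
  return a << n;
}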
@llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>) @@ -18948,7 +18025,7 @@ uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); } -// CHECK-LABEL: define i8 @test_vqshlub_n_s8(i8 %a) #0 { +// CHECK-LABEL: @test_vqshlub_n_s8( // CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 // CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0 @@ -18957,7 +18034,7 @@ int8_t test_vqshlub_n_s8(int8_t a) { return (int8_t)vqshlub_n_s8(a, 7); } -// CHECK-LABEL: define i16 @test_vqshluh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqshluh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 // CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0 @@ -18966,21 +18043,21 @@ int16_t test_vqshluh_n_s16(int16_t a) { return (int16_t)vqshluh_n_s16(a, 15); } -// CHECK-LABEL: define i32 @test_vqshlus_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqshlus_n_s32( // CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31) // CHECK: ret i32 [[VQSHLUS_N_S32]] int32_t test_vqshlus_n_s32(int32_t a) { return (int32_t)vqshlus_n_s32(a, 31); } -// CHECK-LABEL: define i64 @test_vqshlud_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqshlud_n_s64( // CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63) // CHECK: ret i64 [[VQSHLU_N]] int64_t test_vqshlud_n_s64(int64_t a) { return (int64_t)vqshlud_n_s64(a, 63); } -// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vqshlu_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>) @@ -18989,7 +18066,7 @@ uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); } -// CHECK-LABEL: define i64 @test_vsrid_n_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vsrid_n_s64( // CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64> // CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64> // CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63) @@ -18999,7 +18076,7 @@ int64_t test_vsrid_n_s64(int64_t a, int64_t b) { return (int64_t)vsrid_n_s64(a, b, 63); } -// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsri_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -19010,7 +18087,7 @@ int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); } -// CHECK-LABEL: define i64 @test_vsrid_n_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vsrid_n_u64( // CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> // CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> // CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63) @@ -19020,7 +18097,7 @@ uint64_t test_vsrid_n_u64(uint64_t 
a, uint64_t b) { return (uint64_t)vsrid_n_u64(a, b, 63); } -// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsri_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -19031,7 +18108,7 @@ uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); } -// CHECK-LABEL: define i64 @test_vslid_n_s64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vslid_n_s64( // CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64> // CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64> // CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63) @@ -19041,7 +18118,7 @@ int64_t test_vslid_n_s64(int64_t a, int64_t b) { return (int64_t)vslid_n_s64(a, b, 63); } -// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsli_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -19052,7 +18129,7 @@ int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); } -// CHECK-LABEL: define i64 @test_vslid_n_u64(i64 %a, i64 %b) #0 { +// CHECK-LABEL: @test_vslid_n_u64( // CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> // CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> // CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63) @@ -19062,7 +18139,7 @@ uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) { return (uint64_t)vslid_n_u64(a, b, 63); } -// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsli_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -19073,7 +18150,7 @@ uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); } -// CHECK-LABEL: define i8 @test_vqshrnh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqshrnh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0 @@ -19082,7 +18159,7 @@ int8_t test_vqshrnh_n_s16(int16_t a) { return (int8_t)vqshrnh_n_s16(a, 8); } -// CHECK-LABEL: define i16 @test_vqshrns_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqshrns_n_s32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0 @@ -19091,14 +18168,14 @@ int16_t test_vqshrns_n_s32(int32_t a) { return (int16_t)vqshrns_n_s32(a, 16); } -// CHECK-LABEL: define i32 @test_vqshrnd_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqshrnd_n_s64( // CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQSHRND_N_S64]] int32_t test_vqshrnd_n_s64(int64_t a) { return (int32_t)vqshrnd_n_s64(a, 32); } -// CHECK-LABEL: define i8 @test_vqshrnh_n_u16(i16 %a) #0 { +// CHECK-LABEL: 
@test_vqshrnh_n_u16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0 @@ -19107,7 +18184,7 @@ uint8_t test_vqshrnh_n_u16(uint16_t a) { return (uint8_t)vqshrnh_n_u16(a, 8); } -// CHECK-LABEL: define i16 @test_vqshrns_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vqshrns_n_u32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0 @@ -19116,14 +18193,14 @@ uint16_t test_vqshrns_n_u32(uint32_t a) { return (uint16_t)vqshrns_n_u32(a, 16); } -// CHECK-LABEL: define i32 @test_vqshrnd_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vqshrnd_n_u64( // CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQSHRND_N_U64]] uint32_t test_vqshrnd_n_u64(uint64_t a) { return (uint32_t)vqshrnd_n_u64(a, 32); } -// CHECK-LABEL: define i8 @test_vqrshrnh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqrshrnh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0 @@ -19132,7 +18209,7 @@ int8_t test_vqrshrnh_n_s16(int16_t a) { return (int8_t)vqrshrnh_n_s16(a, 8); } -// CHECK-LABEL: define i16 @test_vqrshrns_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqrshrns_n_s32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0 @@ -19141,14 +18218,14 @@ int16_t test_vqrshrns_n_s32(int32_t a) { return (int16_t)vqrshrns_n_s32(a, 16); } -// CHECK-LABEL: define i32 @test_vqrshrnd_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqrshrnd_n_s64( // CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQRSHRND_N_S64]] int32_t test_vqrshrnd_n_s64(int64_t a) { return (int32_t)vqrshrnd_n_s64(a, 32); } -// CHECK-LABEL: define i8 @test_vqrshrnh_n_u16(i16 %a) #0 { +// CHECK-LABEL: @test_vqrshrnh_n_u16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0 @@ -19157,7 +18234,7 @@ uint8_t test_vqrshrnh_n_u16(uint16_t a) { return (uint8_t)vqrshrnh_n_u16(a, 8); } -// CHECK-LABEL: define i16 @test_vqrshrns_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vqrshrns_n_u32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0 @@ -19166,14 +18243,14 @@ uint16_t test_vqrshrns_n_u32(uint32_t a) { return (uint16_t)vqrshrns_n_u32(a, 16); } -// CHECK-LABEL: define i32 @test_vqrshrnd_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vqrshrnd_n_u64( // CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQRSHRND_N_U64]] uint32_t 
test_vqrshrnd_n_u64(uint64_t a) { return (uint32_t)vqrshrnd_n_u64(a, 32); } -// CHECK-LABEL: define i8 @test_vqshrunh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqshrunh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0 @@ -19182,7 +18259,7 @@ int8_t test_vqshrunh_n_s16(int16_t a) { return (int8_t)vqshrunh_n_s16(a, 8); } -// CHECK-LABEL: define i16 @test_vqshruns_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqshruns_n_s32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0 @@ -19191,14 +18268,14 @@ int16_t test_vqshruns_n_s32(int32_t a) { return (int16_t)vqshruns_n_s32(a, 16); } -// CHECK-LABEL: define i32 @test_vqshrund_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqshrund_n_s64( // CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQSHRUND_N_S64]] int32_t test_vqshrund_n_s64(int64_t a) { return (int32_t)vqshrund_n_s64(a, 32); } -// CHECK-LABEL: define i8 @test_vqrshrunh_n_s16(i16 %a) #0 { +// CHECK-LABEL: @test_vqrshrunh_n_s16( // CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 // CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8) // CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0 @@ -19207,7 +18284,7 @@ int8_t test_vqrshrunh_n_s16(int16_t a) { return (int8_t)vqrshrunh_n_s16(a, 8); } -// CHECK-LABEL: define i16 @test_vqrshruns_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vqrshruns_n_s32( // CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 // CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16) // CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0 @@ -19216,2894 +18293,2855 @@ int16_t test_vqrshruns_n_s32(int32_t a) { return (int16_t)vqrshruns_n_s32(a, 16); } -// CHECK-LABEL: define i32 @test_vqrshrund_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vqrshrund_n_s64( // CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32) // CHECK: ret i32 [[VQRSHRUND_N_S64]] int32_t test_vqrshrund_n_s64(int64_t a) { return (int32_t)vqrshrund_n_s64(a, 32); } -// CHECK-LABEL: define float @test_vcvts_n_f32_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vcvts_n_f32_s32( // CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1) // CHECK: ret float [[VCVTS_N_F32_S32]] float32_t test_vcvts_n_f32_s32(int32_t a) { return vcvts_n_f32_s32(a, 1); } -// CHECK-LABEL: define double @test_vcvtd_n_f64_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vcvtd_n_f64_s64( // CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1) // CHECK: ret double [[VCVTD_N_F64_S64]] float64_t test_vcvtd_n_f64_s64(int64_t a) { return vcvtd_n_f64_s64(a, 1); } -// CHECK-LABEL: define float @test_vcvts_n_f32_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vcvts_n_f32_u32( // CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32) // CHECK: ret float [[VCVTS_N_F32_U32]] float32_t test_vcvts_n_f32_u32(uint32_t a) { return vcvts_n_f32_u32(a, 32); } -// CHECK-LABEL: 
define double @test_vcvtd_n_f64_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vcvtd_n_f64_u64( // CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64) // CHECK: ret double [[VCVTD_N_F64_U64]] float64_t test_vcvtd_n_f64_u64(uint64_t a) { return vcvtd_n_f64_u64(a, 64); } -// CHECK-LABEL: define i32 @test_vcvts_n_s32_f32(float %a) #0 { +// CHECK-LABEL: @test_vcvts_n_s32_f32( // CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1) // CHECK: ret i32 [[VCVTS_N_S32_F32]] int32_t test_vcvts_n_s32_f32(float32_t a) { return (int32_t)vcvts_n_s32_f32(a, 1); } -// CHECK-LABEL: define i64 @test_vcvtd_n_s64_f64(double %a) #0 { +// CHECK-LABEL: @test_vcvtd_n_s64_f64( // CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1) // CHECK: ret i64 [[VCVTD_N_S64_F64]] int64_t test_vcvtd_n_s64_f64(float64_t a) { return (int64_t)vcvtd_n_s64_f64(a, 1); } -// CHECK-LABEL: define i32 @test_vcvts_n_u32_f32(float %a) #0 { +// CHECK-LABEL: @test_vcvts_n_u32_f32( // CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32) // CHECK: ret i32 [[VCVTS_N_U32_F32]] uint32_t test_vcvts_n_u32_f32(float32_t a) { return (uint32_t)vcvts_n_u32_f32(a, 32); } -// CHECK-LABEL: define i64 @test_vcvtd_n_u64_f64(double %a) #0 { +// CHECK-LABEL: @test_vcvtd_n_u64_f64( // CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64) // CHECK: ret i64 [[VCVTD_N_U64_F64]] uint64_t test_vcvtd_n_u64_f64(float64_t a) { return (uint64_t)vcvtd_n_u64_f64(a, 64); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u8( // CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t 
test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f64(float64x1_t a) { return vreinterpret_s8_f64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_p8( // CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) { return vreinterpret_s8_p64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u16( // CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: define <4 x 
i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f64(float64x1_t a) { return vreinterpret_s16_f64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_p16( // CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) { return vreinterpret_s16_p64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: 
define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u32( // CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f64(float64x1_t a) { return vreinterpret_s32_f64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) { return vreinterpret_s32_p64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// 
CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u64( // CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f64(float64x1_t a) { return vreinterpret_s64_f64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_p64( // CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) { return vreinterpret_s64_p64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s8( // CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s64( // CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) { return vreinterpret_u8_f64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_p8( // CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) { return vreinterpret_u8_p64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s16( // CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t 
test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) { return vreinterpret_u16_f64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_p16( // CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) { return vreinterpret_u16_p64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> 
[[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s32( // CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) { return vreinterpret_u32_f64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_p64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) { return vreinterpret_u32_p64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> 
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_s64(
// CHECK: ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
  return vreinterpret_u64_f64(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_u64_p64(
// CHECK: ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
  return vreinterpret_u64_p64(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
  return vreinterpret_f16_s8(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
  return vreinterpret_f16_s16(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
  return vreinterpret_f16_f64(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}

-// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f16_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
  return vreinterpret_f16_p64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
  return vreinterpret_f32_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}

-// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f32_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
  return vreinterpret_f32_p64(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
  return vreinterpret_f64_s8(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
  return vreinterpret_f64_s16(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
  return vreinterpret_f64_s32(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
  return vreinterpret_f64_s64(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
  return vreinterpret_f64_u8(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
  return vreinterpret_f64_u16(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
  return vreinterpret_f64_u32(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
  return vreinterpret_f64_u64(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
  return vreinterpret_f64_f16(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
  return vreinterpret_f64_f32(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
  return vreinterpret_f64_p8(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
  return vreinterpret_f64_p16(a);
}

-// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_f64_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK: ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
  return vreinterpret_f64_p64(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_s8(
// CHECK: ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_u8(
// CHECK: ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
  return vreinterpret_p8_f64(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p8_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
  return vreinterpret_p8_p64(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
  return vreinterpret_p16_s8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_s16(
// CHECK: ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
  return vreinterpret_p16_s16(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
  return vreinterpret_p16_s32(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
  return vreinterpret_p16_s64(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
  return vreinterpret_p16_u8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_u16(
// CHECK: ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
  return vreinterpret_p16_u16(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
  return vreinterpret_p16_u32(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
  return vreinterpret_p16_u64(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
  return vreinterpret_p16_f16(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
  return vreinterpret_p16_f32(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
  return vreinterpret_p16_f64(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
  return vreinterpret_p16_p8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p16_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
  return vreinterpret_p16_p64(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
  return vreinterpret_p64_s8(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
  return vreinterpret_p64_s16(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
  return vreinterpret_p64_s32(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_s64(
// CHECK: ret <1 x i64> %a
poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
  return vreinterpret_p64_s64(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
  return vreinterpret_p64_u8(a);
}

-// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_p64_u16(
// CHECK: [[TMP0:%.*]] =
bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) { return vreinterpret_p64_u16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) { return vreinterpret_p64_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_u64( // CHECK: ret <1 x i64> %a poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) { return vreinterpret_p64_u64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) { return vreinterpret_p64_f16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) { return vreinterpret_p64_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) { return vreinterpret_p64_f64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) { return vreinterpret_p64_p8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p64_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) { return vreinterpret_p64_p16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u8( // CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // 
CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) { return vreinterpretq_s8_f64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_p8( // CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) { return vreinterpretq_s8_p64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u8( // CHECK: [[TMP0:%.*]] = 
bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u16( // CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) { return vreinterpretq_s16_f64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_p16( // CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) { return vreinterpretq_s16_p64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s64( // CHECK: [[TMP0:%.*]] 
= bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u32( // CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) { return vreinterpretq_s32_f64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) { return vreinterpretq_s32_p64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 { +// 
CHECK-LABEL: @test_vreinterpretq_s64_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_u64( // CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) { return vreinterpretq_s64_f64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s64_p64( // CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) { return vreinterpretq_s64_p64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 { 
+// CHECK-LABEL: @test_vreinterpretq_u8_s8( // CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) { return vreinterpretq_u8_f64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_p8( // CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u8_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: 
@test_vreinterpretq_u8_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) { return vreinterpretq_u8_p64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_s16( // CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) { return vreinterpretq_u16_f64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: define 
<8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_p16( // CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u16_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) { return vreinterpretq_u16_p64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_s32( // CHECK: ret <4 x i32> %a uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) { return vreinterpretq_u32_f64(a); } -// 
CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u32_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) { return vreinterpretq_u32_p64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_s64( // CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t 
test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) { return vreinterpretq_u64_f64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> // CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_u64_p64( // CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) { return vreinterpretq_u64_p64(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_u64( // 
CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) { return vreinterpretq_f16_f64(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f16_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> // CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) { return vreinterpretq_f16_p64(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t 
test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) { return vreinterpretq_f32_f64(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f32_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> // CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) { return vreinterpretq_f32_p64(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) { return vreinterpretq_f64_s8(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) { return vreinterpretq_f64_s16(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) { return vreinterpretq_f64_s32(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) { return vreinterpretq_f64_s64(a); } 
-// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) { return vreinterpretq_f64_u8(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) { return vreinterpretq_f64_u16(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) { return vreinterpretq_f64_u32(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) { return vreinterpretq_f64_u64(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) { return vreinterpretq_f64_f16(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) { return vreinterpretq_f64_f32(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) { return vreinterpretq_f64_p8(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) { return vreinterpretq_f64_p16(a); } -// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_f64_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> // CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) { return vreinterpretq_f64_p64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_s8( // CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_s32( // CHECK: [[TMP0:%.*]] = 
bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_u8( // CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) { return vreinterpretq_p8_f64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p8_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) { return vreinterpretq_p8_p64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: 
@test_vreinterpretq_p16_s16( // CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_u16( // CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) { return vreinterpretq_p16_f64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_p16_p64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) { return vreinterpretq_p16_p64(a); } -// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s8(<16 x i8> %a) #0 { 
+// CHECK-LABEL: @test_vreinterpretq_p64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
  return vreinterpretq_p64_s8(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
  return vreinterpretq_p64_s16(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
  return vreinterpretq_p64_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_s64(
// CHECK: ret <2 x i64> %a
poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
  return vreinterpretq_p64_s64(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
  return vreinterpretq_p64_u8(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
  return vreinterpretq_p64_u16(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
  return vreinterpretq_p64_u32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_u64(
// CHECK: ret <2 x i64> %a
poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
  return vreinterpretq_p64_u64(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
  return vreinterpretq_p64_f16(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
  return vreinterpretq_p64_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
  return vreinterpretq_p64_f64(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
  return vreinterpretq_p64_p8(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
  return vreinterpretq_p64_p16(a);
}
-// CHECK-LABEL: define float @test_vabds_f32(float %a, float %b) #0 {
+// CHECK-LABEL: @test_vabds_f32(
// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4
// CHECK: ret float [[VABDS_F32_I]]
float32_t test_vabds_f32(float32_t a, float32_t b) {
  return vabds_f32(a, b);
}
-// CHECK-LABEL: define double @test_vabdd_f64(double %a, double %b) #0 {
+// CHECK-LABEL: @test_vabdd_f64(
// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4
// CHECK: ret double [[VABDD_F64_I]]
float64_t test_vabdd_f64(float64_t a, float64_t b) {
  return vabdd_f64(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vuqadd_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) #4
+// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: ret <1 x i64> [[VUQADD2_I]]
int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
  return vuqadd_s64(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsqadd_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: ret <1 x i64> [[VSQADD2_I]]
uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
  return vsqadd_u64(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vsqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsqadd_u8(
// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VSQADD_I]]
uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
  return vsqadd_u8(a, b);
}
-// CHECK-LABEL: define <16 x i8> @test_vsqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsqaddq_u8(
// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VSQADD_I]]
uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) {
  return vsqaddq_u8(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vsqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsqadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: ret <4 x i16> [[VSQADD2_I]]
uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) {
  return vsqadd_u16(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vsqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsqaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i16> [[VSQADD2_I]]
uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) {
  return vsqaddq_u16(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vsqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsqadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: ret <2 x i32> [[VSQADD2_I]]
uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) {
  return vsqadd_u32(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vsqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsqaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: ret <4 x i32> [[VSQADD2_I]]
uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) {
  return vsqaddq_u32(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vsqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsqaddq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) #4
+// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: ret <2 x i64> [[VSQADD2_I]]
uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) {
  return vsqaddq_u64(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vabs_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vabs_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) #4
+// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %a) #4
// CHECK: ret <1 x i64> [[VABS1_I]]
int64x1_t test_vabs_s64(int64x1_t a) {
  return vabs_s64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vqabs_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqabs_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) #4
+// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> %a) #4
// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK: ret <1 x i64> [[VQABS_V1_I]]
int64x1_t test_vqabs_s64(int64x1_t a) {
  return vqabs_s64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vqneg_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> %a) #4
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK: ret <1 x i64> [[VQNEG_V1_I]]
int64x1_t test_vqneg_s64(int64x1_t a) {
  return vqneg_s64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vneg_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vneg_s64(
// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a
// CHECK: ret <1 x i64> [[SUB_I]]
int64x1_t test_vneg_s64(int64x1_t a) {
  return vneg_s64(a);
}
-// CHECK-LABEL: define float @test_vaddv_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vaddv_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a) #4
// CHECK: ret float [[VADDV_F32_I]]
float32_t test_vaddv_f32(float32x2_t a) {
  return vaddv_f32(a);
}
-// CHECK-LABEL: define float @test_vaddvq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vaddvq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[TMP1]]) #4
+// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a) #4
// CHECK: ret float [[VADDVQ_F32_I]]
float32_t test_vaddvq_f32(float32x4_t a) {
  return vaddvq_f32(a);
}
-// CHECK-LABEL: define double @test_vaddvq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vaddvq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a) #4
// CHECK: ret double [[VADDVQ_F64_I]]
float64_t test_vaddvq_f64(float64x2_t a) {
  return vaddvq_f64(a);
}
-// CHECK-LABEL: define float @test_vmaxv_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vmaxv_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) #4
// CHECK: ret float [[VMAXV_F32_I]]
float32_t test_vmaxv_f32(float32x2_t a) {
  return vmaxv_f32(a);
}
-// CHECK-LABEL: define double @test_vmaxvq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vmaxvq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) #4
// CHECK: ret double [[VMAXVQ_F64_I]]
float64_t test_vmaxvq_f64(float64x2_t a) {
  return vmaxvq_f64(a);
}
-// CHECK-LABEL: define float @test_vminv_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vminv_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) #4
// CHECK: ret float [[VMINV_F32_I]]
float32_t test_vminv_f32(float32x2_t a) {
  return vminv_f32(a);
}
-// CHECK-LABEL: define double @test_vminvq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vminvq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) #4
// CHECK: ret double [[VMINVQ_F64_I]]
float64_t test_vminvq_f64(float64x2_t a) {
  return vminvq_f64(a);
}
-// CHECK-LABEL: define double @test_vmaxnmvq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vmaxnmvq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) #4
// CHECK: ret double [[VMAXNMVQ_F64_I]]
float64_t test_vmaxnmvq_f64(float64x2_t a) {
  return vmaxnmvq_f64(a);
}
-// CHECK-LABEL: define float @test_vmaxnmv_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vmaxnmv_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) #4
// CHECK: ret float [[VMAXNMV_F32_I]]
float32_t test_vmaxnmv_f32(float32x2_t a) {
  return vmaxnmv_f32(a);
}
-// CHECK-LABEL: define double @test_vminnmvq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vminnmvq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) #4
// CHECK: ret double [[VMINNMVQ_F64_I]]
float64_t test_vminnmvq_f64(float64x2_t a) {
  return vminnmvq_f64(a);
}
-// CHECK-LABEL: define float @test_vminnmv_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vminnmv_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) #4
// CHECK: ret float [[VMINNMV_F32_I]]
float32_t test_vminnmv_f32(float32x2_t a) {
  return vminnmv_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
+// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
  return vpaddq_s64(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vpaddq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
+// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vpaddq_u64(a, b);
}
-// CHECK-LABEL: define i64 @test_vpaddd_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vpaddd_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4
// CHECK: ret i64 [[VPADDD_U64_I]]
uint64_t test_vpaddd_u64(uint64x2_t a) {
  return vpaddd_u64(a);
}
-// CHECK-LABEL: define i64 @test_vaddvq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vaddvq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a) #4
// CHECK: ret i64 [[VADDVQ_S64_I]]
int64_t test_vaddvq_s64(int64x2_t a) {
  return vaddvq_s64(a);
}
-// CHECK-LABEL: define i64 @test_vaddvq_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vaddvq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4
// CHECK: ret i64 [[VADDVQ_U64_I]]
uint64_t test_vaddvq_u64(uint64x2_t a) {
  return vaddvq_u64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vadd_f64(
// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
// CHECK: ret <1 x double> [[ADD_I]]
float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
  return vadd_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vmul_f64(
// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
// CHECK: ret <1 x double> [[MUL_I]]
float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
  return vmul_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vdiv_f64(
// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
// CHECK: ret <1 x double> [[DIV_I]]
float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
  return vdiv_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK-LABEL: @test_vmla_f64(
// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
// CHECK: ret <1 x double> [[ADD_I]]
@@ -22111,7 +21149,7 @@ float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vmla_f64(a, b, c);
}
-// CHECK-LABEL: define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK-LABEL: @test_vmls_f64(
// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
// CHECK: ret <1 x double> [[SUB_I]]
@@ -22119,220 +21157,191 @@ float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vmls_f64(a, b, c);
}
-// CHECK-LABEL: define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK-LABEL: @test_vfma_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
-// CHECK: ret <1 x double> [[TMP6]]
+// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a) #4
+// CHECK: ret <1 x double> [[TMP3]]
float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vfma_f64(a, b, c);
}
-// CHECK-LABEL: define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK-LABEL: @test_vfms_f64(
// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
-// CHECK: ret <1 x double> [[TMP6]]
+// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a) #4
+// CHECK: ret <1 x double> [[TMP3]]
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vfms_f64(a, b, c);
}
-// CHECK-LABEL: define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vsub_f64(
// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
// CHECK: ret <1 x double> [[SUB_I]]
float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
  return vsub_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vabd_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]]) #4
+// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: ret <1 x double> [[VABD2_I]]
float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
  return vabd_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vmax_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]]) #4
+// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: ret <1 x double> [[VMAX2_I]]
float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
  return vmax_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vmin_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]]) #4
+// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: ret <1 x double> [[VMIN2_I]]
float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
  return vmin_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vmaxnm_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]]) #4
+// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: ret <1 x double> [[VMAXNM2_I]]
float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
  return vmaxnm_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vminnm_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]]) #4
+// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: ret <1 x double> [[VMINNM2_I]]
float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
  return vminnm_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vabs_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vabs_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]]) #4
+// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VABS1_I]]
float64x1_t test_vabs_f64(float64x1_t a) {
  return vabs_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vneg_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vneg_f64(
// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a
// CHECK: ret <1 x double> [[SUB_I]]
float64x1_t test_vneg_f64(float64x1_t a) {
  return vneg_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvt_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fptosi <1 x double> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptosi <1 x double> %a to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP1]]
int64x1_t test_vcvt_s64_f64(float64x1_t a) {
  return vcvt_s64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvt_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fptoui <1 x double> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptoui <1 x double> %a to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP1]]
uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
  return vcvt_u64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtn_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
+// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTN1_I]]
int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
  return vcvtn_s64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtn_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
+// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTN1_I]]
uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
  return vcvtn_u64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtp_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
+// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTP1_I]]
int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
  return vcvtp_s64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtp_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
+// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTP1_I]]
uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
  return vcvtp_u64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtm_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
+// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTM1_I]]
int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
  return vcvtm_s64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtm_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
+// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTM1_I]]
uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
  return vcvtm_u64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvta_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
+// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTA1_I]]
int64x1_t test_vcvta_s64_f64(float64x1_t a) {
  return vcvta_s64_f64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvta_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
+// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x i64> [[VCVTA1_I]]
uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
  return vcvta_u64_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f64_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
// CHECK: ret <1 x double> [[VCVT_I]]
float64x1_t test_vcvt_f64_s64(int64x1_t a) {
  return vcvt_f64_s64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f64_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
// CHECK: ret <1 x double> [[VCVT_I]]
float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
  return vcvt_f64_u64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
@@ -22341,7 +21350,7 @@ int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
  return vcvt_n_s64_f64(a, 64);
}
-// CHECK-LABEL: define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
@@ -22350,7 +21359,7 @@ uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
  return vcvt_n_u64_f64(a, 64);
}
-// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_f64_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
@@ -22359,7 +21368,7 @@ float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
  return vcvt_n_f64_s64(a, 64);
}
-// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_f64_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
@@ -22368,189 +21377,164 @@ float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
  return vcvt_n_f64_u64(a, 64);
}
-// CHECK-LABEL: define <1 x double> @test_vrndn_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndn_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> [[VRNDN_I]]) #4
+// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDN1_I]]
float64x1_t test_vrndn_f64(float64x1_t a) {
  return vrndn_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrnda_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrnda_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) #4
+// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDA1_I]]
float64x1_t test_vrnda_f64(float64x1_t a) {
  return vrnda_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrndp_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndp_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[VRNDP_I]]) #4
+// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDP1_I]]
float64x1_t test_vrndp_f64(float64x1_t a) {
  return vrndp_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrndm_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndm_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) #4
+// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDM1_I]]
float64x1_t test_vrndm_f64(float64x1_t a) {
  return vrndm_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrndx_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndx_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) #4
+// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDX1_I]]
float64x1_t test_vrndx_f64(float64x1_t a) {
  return vrndx_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrnd_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrnd_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) #4
+// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDZ1_I]]
float64x1_t test_vrnd_f64(float64x1_t a) {
  return vrnd_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrndi_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndi_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_I]]) #4
+// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRNDI1_I]]
float64x1_t test_vrndi_f64(float64x1_t a) {
  return vrndi_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrsqrte_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrsqrte_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]]) #4
+// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRSQRTE_V1_I]]
float64x1_t test_vrsqrte_f64(float64x1_t a) {
  return vrsqrte_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrecpe_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vrecpe_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]]) #4
+// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VRECPE_V1_I]]
float64x1_t test_vrecpe_f64(float64x1_t a) {
  return vrecpe_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vsqrt_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vsqrt_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP1]]) #4
+// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) #4
// CHECK: ret <1 x double> [[VSQRT_I]]
float64x1_t test_vsqrt_f64(float64x1_t a) {
  return vsqrt_f64(a);
}
-// CHECK-LABEL: define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vrecps_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]]) #4
-// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP2]]
+// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK: ret <1 x double> [[VRECPS_V2_I]]
float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
  return vrecps_f64(a, b);
}
-// CHECK-LABEL: define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK-LABEL: @test_vrsqrts_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]]) #4
+// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> %a, <1 x double> %b) #4
// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP2]]
+// CHECK: ret <1 x double> [[VRSQRTS_V2_I]]
float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
  return vrsqrts_f64(a, b);
}
-// CHECK-LABEL: define i32 @test_vminv_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vminv_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VMINV_S32_I]]
int32_t test_vminv_s32(int32x2_t a) {
  return vminv_s32(a);
}
-// CHECK-LABEL: define i32 @test_vminv_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vminv_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VMINV_U32_I]]
uint32_t test_vminv_u32(uint32x2_t a) {
  return vminv_u32(a);
}
-// CHECK-LABEL: define i32 @test_vmaxv_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vmaxv_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VMAXV_S32_I]]
int32_t test_vmaxv_s32(int32x2_t a) {
  return vmaxv_s32(a);
}
-// CHECK-LABEL: define i32 @test_vmaxv_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vmaxv_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VMAXV_U32_I]]
uint32_t test_vmaxv_u32(uint32x2_t a) {
  return vmaxv_u32(a);
}
-// CHECK-LABEL: define i32 @test_vaddv_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vaddv_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VADDV_S32_I]]
int32_t test_vaddv_s32(int32x2_t a) {
  return vaddv_s32(a);
}
-// CHECK-LABEL: define i32 @test_vaddv_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vaddv_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a) #4
// CHECK: ret i32 [[VADDV_U32_I]]
uint32_t test_vaddv_u32(uint32x2_t a) {
  return vaddv_u32(a);
}
-// CHECK-LABEL: define i64 @test_vaddlv_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vaddlv_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a) #4
// CHECK: ret i64 [[VADDLV_S32_I]]
int64_t test_vaddlv_s32(int32x2_t a) {
  return vaddlv_s32(a);
}
-// CHECK-LABEL: define i64 @test_vaddlv_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vaddlv_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a) #4
// CHECK: ret i64 [[VADDLV_U32_I]]
uint64_t test_vaddlv_u32(uint32x2_t a) {
  return vaddlv_u32(a);
diff --git a/test/CodeGen/aarch64-neon-misc.c b/test/CodeGen/aarch64-neon-misc.c
index 4ecf562a5d29..1342bbb0c8cb 100644
--- a/test/CodeGen/aarch64-neon-misc.c
+++ b/test/CodeGen/aarch64-neon-misc.c
@@ -6,7 +6,7 @@
#include <arm_neon.h>
-// CHECK-LABEL: define <8 x i8> @test_vceqz_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vceqz_s8(
// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCEQZ_I]]
@@ -14,57 +14,52 @@ uint8x8_t test_vceqz_s8(int8x8_t a) {
  return vceqz_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vceqz_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vceqz_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCEQZ_I]]
uint16x4_t test_vceqz_s16(int16x4_t a) {
  return vceqz_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vceqz_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vceqz_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCEQZ_I]]
uint32x2_t test_vceqz_s32(int32x2_t a) {
  return vceqz_s32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqz_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCEQZ_I]]
uint64x1_t test_vceqz_s64(int64x1_t a) {
  return vceqz_s64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqz_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCEQZ_I]]
uint64x1_t test_vceqz_u64(uint64x1_t a) {
  return vceqz_u64(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqz_p64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCEQZ_I]]
uint64x1_t test_vceqz_p64(poly64x1_t a) {
  return vceqz_p64(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vceqzq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_s8(
// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCEQZ_I]]
@@ -72,37 +67,34 @@ uint8x16_t test_vceqzq_s8(int8x16_t a) {
  return vceqzq_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vceqzq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCEQZ_I]]
uint16x8_t test_vceqzq_s16(int16x8_t a) {
  return vceqzq_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vceqzq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCEQZ_I]]
uint32x4_t test_vceqzq_s32(int32x4_t a) {
  return vceqzq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vceqzq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCEQZ_I]]
uint64x2_t test_vceqzq_s64(int64x2_t a) {
  return vceqzq_s64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vceqz_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vceqz_u8(
// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCEQZ_I]]
@@ -110,27 +102,25 @@ uint8x8_t test_vceqz_u8(uint8x8_t a) {
  return vceqz_u8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vceqz_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vceqz_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCEQZ_I]]
uint16x4_t test_vceqz_u16(uint16x4_t a) {
  return vceqz_u16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vceqz_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vceqz_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCEQZ_I]]
uint32x2_t test_vceqz_u32(uint32x2_t a) {
  return vceqz_u32(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vceqzq_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_u8(
// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCEQZ_I]]
@@ -138,67 +128,61 @@ uint8x16_t test_vceqzq_u8(uint8x16_t a) {
  return vceqzq_u8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vceqzq_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCEQZ_I]]
uint16x8_t test_vceqzq_u16(uint16x8_t a) {
  return vceqzq_u16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vceqzq_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCEQZ_I]]
uint32x4_t test_vceqzq_u32(uint32x4_t a) {
  return vceqzq_u32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vceqzq_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCEQZ_I]]
uint64x2_t test_vceqzq_u64(uint64x2_t a) {
  return vceqzq_u64(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vceqz_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vceqz_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x float> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCEQZ_I]]
test_vceqz_f32(float32x2_t a) { return vceqz_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 { +// CHECK-LABEL: @test_vceqz_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[TMP2:%.*]] = fcmp oeq <1 x double> [[TMP1]], zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: [[TMP1:%.*]] = fcmp oeq <1 x double> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> // CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_f64(float64x1_t a) { return vceqz_f64(a); } -// CHECK-LABEL: define <4 x i32> @test_vceqzq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vceqzq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x float> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> // CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_f32(float32x4_t a) { return vceqzq_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vceqz_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vceqz_p8( // CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer // CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> // CHECK: ret <8 x i8> [[VCEQZ_I]] @@ -206,7 +190,7 @@ uint8x8_t test_vceqz_p8(poly8x8_t a) { return vceqz_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vceqzq_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vceqzq_p8( // CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer // CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> // CHECK: ret <16 x i8> [[VCEQZ_I]] @@ -214,47 +198,43 @@ uint8x16_t test_vceqzq_p8(poly8x16_t a) { return vceqzq_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vceqz_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vceqz_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> // CHECK: ret <4 x i16> [[VCEQZ_I]] uint16x4_t test_vceqz_p16(poly16x4_t a) { return vceqz_p16(a); } -// CHECK-LABEL: define <8 x i16> @test_vceqzq_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vceqzq_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> // CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_p16(poly16x8_t a) { return vceqzq_p16(a); } -// CHECK-LABEL: define <2 x i64> @test_vceqzq_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vceqzq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP1]], zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: 
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCEQZ_I]]
uint64x2_t test_vceqzq_f64(float64x2_t a) {
  return vceqzq_f64(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vceqzq_p64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCEQZ_I]]
uint64x2_t test_vceqzq_p64(poly64x2_t a) {
  return vceqzq_p64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vcgez_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcgez_s8(
// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer
// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCGEZ_I]]
@@ -262,37 +242,34 @@ uint8x8_t test_vcgez_s8(int8x8_t a) {
  return vcgez_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vcgez_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcgez_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sge <4 x i16> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCGEZ_I]]
uint16x4_t test_vcgez_s16(int16x4_t a) {
  return vcgez_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcgez_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcgez_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sge <2 x i32> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCGEZ_I]]
uint32x2_t test_vcgez_s32(int32x2_t a) {
  return vcgez_s32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcgez_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sge <1 x i64> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCGEZ_I]]
uint64x1_t test_vcgez_s64(int64x1_t a) {
  return vcgez_s64(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vcgezq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_s8(
// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer
// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCGEZ_I]]
@@ -300,77 +277,70 @@ uint8x16_t test_vcgezq_s8(int8x16_t a) {
  return vcgezq_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vcgezq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sge <8 x i16> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCGEZ_I]]
uint16x8_t test_vcgezq_s16(int16x8_t a) {
  return vcgezq_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcgezq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sge <4 x i32> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCGEZ_I]]
uint32x4_t test_vcgezq_s32(int32x4_t a) {
  return vcgezq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcgezq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sge <2 x i64> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCGEZ_I]]
uint64x2_t test_vcgezq_s64(int64x2_t a) {
  return vcgezq_s64(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcgez_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcgez_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fcmp oge <2 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp oge <2 x float> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCGEZ_I]]
uint32x2_t test_vcgez_f32(float32x2_t a) {
  return vcgez_f32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcgez_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcgez_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fcmp oge <1 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp oge <1 x double> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCGEZ_I]]
uint64x1_t test_vcgez_f64(float64x1_t a) {
  return vcgez_f64(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcgezq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp oge <4 x float> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCGEZ_I]]
uint32x4_t test_vcgezq_f32(float32x4_t a) {
  return vcgezq_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcgezq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcgezq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp oge <2 x double> %a, zeroinitializer
+// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCGEZ_I]]
uint64x2_t test_vcgezq_f64(float64x2_t a) {
  return vcgezq_f64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vclez_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclez_s8(
// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer
// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCLEZ_I]]
@@ -378,37 +348,34 @@ uint8x8_t test_vclez_s8(int8x8_t a) {
  return vclez_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vclez_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclez_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sle <4 x i16> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCLEZ_I]]
uint16x4_t test_vclez_s16(int16x4_t a) {
  return vclez_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vclez_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclez_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sle <2 x i32> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCLEZ_I]]
uint32x2_t test_vclez_s32(int32x2_t a) {
  return vclez_s32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vclez_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sle <1 x i64> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCLEZ_I]]
uint64x1_t test_vclez_s64(int64x1_t a) {
  return vclez_s64(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vclezq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclezq_s8(
// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer
// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCLEZ_I]]
@@ -416,77 +383,70 @@ uint8x16_t test_vclezq_s8(int8x16_t a) {
  return vclezq_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vclezq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclezq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sle <8 x i16> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCLEZ_I]]
uint16x8_t test_vclezq_s16(int16x8_t a) {
  return vclezq_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vclezq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclezq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sle <4 x i32> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCLEZ_I]]
uint32x4_t test_vclezq_s32(int32x4_t a) {
  return vclezq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vclezq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vclezq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sle <2 x i64> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCLEZ_I]]
uint64x2_t test_vclezq_s64(int64x2_t a) {
  return vclezq_s64(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vclez_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vclez_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fcmp ole <2 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp ole <2 x float> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCLEZ_I]]
uint32x2_t test_vclez_f32(float32x2_t a) {
  return vclez_f32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vclez_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vclez_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fcmp ole <1 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp ole <1 x double> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCLEZ_I]]
uint64x1_t test_vclez_f64(float64x1_t a) {
  return vclez_f64(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vclezq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vclezq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp ole <4 x float> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCLEZ_I]]
uint32x4_t test_vclezq_f32(float32x4_t a) {
  return vclezq_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vclezq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vclezq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp ole <2 x double> %a, zeroinitializer
+// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCLEZ_I]]
uint64x2_t test_vclezq_f64(float64x2_t a) {
  return vclezq_f64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vcgtz_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_s8(
// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer
// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCGTZ_I]]
@@ -494,37 +454,34 @@ uint8x8_t test_vcgtz_s8(int8x8_t a) {
  return vcgtz_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vcgtz_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i16> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCGTZ_I]]
uint16x4_t test_vcgtz_s16(int16x4_t a) {
  return vcgtz_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcgtz_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i32> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCGTZ_I]]
uint32x2_t test_vcgtz_s32(int32x2_t a) {
  return vcgtz_s32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sgt <1 x i64> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCGTZ_I]]
uint64x1_t test_vcgtz_s64(int64x1_t a) {
  return vcgtz_s64(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vcgtzq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_s8(
// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer
// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCGTZ_I]]
@@ -532,77 +489,70 @@ uint8x16_t test_vcgtzq_s8(int8x16_t a) {
  return vcgtzq_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vcgtzq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp sgt <8 x i16> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCGTZ_I]]
uint16x8_t test_vcgtzq_s16(int16x8_t a) {
  return vcgtzq_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcgtzq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i32> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCGTZ_I]]
uint32x4_t test_vcgtzq_s32(int32x4_t a) {
  return vcgtzq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcgtzq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i64> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCGTZ_I]]
uint64x2_t test_vcgtzq_s64(int64x2_t a) {
  return vcgtzq_s64(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcgtz_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fcmp ogt <2 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x float> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCGTZ_I]]
uint32x2_t test_vcgtz_f32(float32x2_t a) {
  return vcgtz_f32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcgtz_f64(<1 x double> %a) #0 {
+// CHECK-LABEL: @test_vcgtz_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fcmp ogt <1 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp ogt <1 x double> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCGTZ_I]]
uint64x1_t test_vcgtz_f64(float64x1_t a) {
  return vcgtz_f64(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcgtzq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x float> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCGTZ_I]]
uint32x4_t test_vcgtzq_f32(float32x4_t a) {
  return vcgtzq_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcgtzq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcgtzq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fcmp ogt <2 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x double> %a, zeroinitializer
+// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCGTZ_I]]
uint64x2_t test_vcgtzq_f64(float64x2_t a) {
  return vcgtzq_f64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vcltz_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcltz_s8(
// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer
// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK: ret <8 x i8> [[VCLTZ_I]]
@@ -610,37 +560,34 @@ uint8x8_t test_vcltz_s8(int8x8_t a) {
  return vcltz_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vcltz_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcltz_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK: [[TMP1:%.*]] = icmp slt <4 x i16> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK: ret <4 x i16> [[VCLTZ_I]]
uint16x4_t test_vcltz_s16(int16x4_t a) {
  return vcltz_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcltz_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcltz_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = icmp slt <2 x i32> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCLTZ_I]]
uint32x2_t test_vcltz_s32(int32x2_t a) {
  return vcltz_s32(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcltz_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = icmp slt <1 x i64> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCLTZ_I]]
uint64x1_t test_vcltz_s64(int64x1_t a) {
  return vcltz_s64(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vcltzq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_s8(
// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
// CHECK: ret <16 x i8> [[VCLTZ_I]]
@@ -648,1593 +595,1454 @@ uint8x16_t test_vcltzq_s8(int8x16_t a) {
  return vcltzq_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vcltzq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK: [[TMP1:%.*]] = icmp slt <8 x i16> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK: ret <8 x i16> [[VCLTZ_I]]
uint16x8_t test_vcltzq_s16(int16x8_t a) {
  return vcltzq_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcltzq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = icmp slt <4 x i32> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCLTZ_I]]
uint32x4_t test_vcltzq_s32(int32x4_t a) {
  return vcltzq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcltzq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = icmp slt <2 x i64> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCLTZ_I]]
uint64x2_t test_vcltzq_s64(int64x2_t a) {
  return vcltzq_s64(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vcltz_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcltz_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fcmp olt <2 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp olt <2 x float> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK: ret <2 x i32> [[VCLTZ_I]]
uint32x2_t test_vcltz_f32(float32x2_t a) {
  return vcltz_f32(a);
}
-
-// CHECK-LABEL: define <1 x i64> @test_vcltz_f64(<1 x double> %a) #0 {
+
+// CHECK-LABEL: @test_vcltz_f64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP2:%.*]] = fcmp olt <1 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp olt <1 x double> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
// CHECK: ret <1 x i64> [[VCLTZ_I]]
uint64x1_t test_vcltz_f64(float64x1_t a) {
  return vcltz_f64(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vcltzq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK: [[TMP1:%.*]] = fcmp olt <4 x float> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK: ret <4 x i32> [[VCLTZ_I]]
uint32x4_t test_vcltzq_f32(float32x4_t a) {
  return vcltzq_f32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vcltzq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcltzq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP1]], zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK: [[TMP1:%.*]] = fcmp olt <2 x double> %a, zeroinitializer
+// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
// CHECK: ret <2 x i64> [[VCLTZ_I]]
uint64x2_t test_vcltzq_f64(float64x2_t a) {
  return vcltzq_f64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) {
  return vrev16_s8(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) {
  return vrev16_u8(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) {
  return vrev16_p8(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) {
  return vrev16q_s8(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
  return vrev16q_u8(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
  return vrev16q_p8(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) {
  return vrev32_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) {
  return vrev32_s16(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) {
  return vrev32_u8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) {
  return vrev32_u16(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) {
  return vrev32_p8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) {
  return vrev32_p16(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) {
  return vrev32q_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) {
  return vrev32q_s16(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
  return vrev32q_u8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
  return vrev32q_u16(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
  return vrev32q_p8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
  return vrev32q_p16(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) {
  return vrev64_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) {
  return vrev64_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) {
  return vrev64_s32(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) {
  return vrev64_u8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) {
  return vrev64_u16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) {
  return vrev64_u32(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) {
  return vrev64_p8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) {
  return vrev64_p16(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrev64_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) {
  return vrev64_f32(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) {
  return vrev64q_s8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) {
  return vrev64q_s16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) {
  return vrev64q_s32(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
  return vrev64q_u8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
  return vrev64q_u16(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
  return vrev64q_u32(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
  return vrev64q_p8(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
  return vrev64q_p16(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) {
  return vrev64q_f32(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_s8(
// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2
// CHECK: ret <4 x i16> [[VPADDL_I]]
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %a) #2
// CHECK: ret <2 x i32> [[VPADDL1_I]]
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %a) #2
// CHECK: ret <1 x i64> [[VPADDL1_I]]
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_u8(
// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2
// CHECK: ret <4 x i16> [[VPADDL_I]]
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %a) #2
// CHECK: ret <2 x i32> [[VPADDL1_I]]
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}
-// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vpaddl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %a) #2
// CHECK: ret <1 x i64> [[VPADDL1_I]]
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
  return vpaddl_u32(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_s8(
// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
// CHECK: ret <8 x i16> [[VPADDL_I]]
int16x8_t test_vpaddlq_s8(int8x16_t a) {
  return vpaddlq_s8(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %a) #2
// CHECK: ret <4 x i32> [[VPADDL1_I]]
int32x4_t test_vpaddlq_s16(int16x8_t a) {
  return vpaddlq_s16(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %a) #2
// CHECK: ret <2 x i64> [[VPADDL1_I]]
int64x2_t test_vpaddlq_s32(int32x4_t a) {
  return vpaddlq_s32(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_u8(
// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
// CHECK: ret <8 x i16> [[VPADDL_I]]
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
  return vpaddlq_u8(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %a) #2
// CHECK: ret <4 x i32> [[VPADDL1_I]]
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
  return vpaddlq_u16(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vpaddlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %a) #2
// CHECK: ret <2 x i64> [[VPADDL1_I]]
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
  return vpaddlq_u32(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadal_s8(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
+// CHECK: ret <4 x i16> [[TMP1]]
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %b) #2
+// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %b) #2
+// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}
-// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadal_u8(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
+// CHECK: ret <4 x i16> [[TMP1]]
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}
-// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %b) #2
+// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}
-// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %b) #2
+// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
+// CHECK: ret <8 x i16> [[TMP1]]
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %b) #2
+// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %b) #2
+// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}
-// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
+// CHECK: ret <8 x i16> [[TMP1]]
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}
-// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %b) #2
+// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}
-// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vpadalq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %b) #2
+// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}
-// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqabs_s8(
// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
// CHECK: ret <8 x i8> [[VQABS_V_I]]
int8x8_t test_vqabs_s8(int8x8_t a) {
  return vqabs_s8(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqabsq_s8(
// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
int8x16_t test_vqabsq_s8(int8x16_t a) {
  return vqabsq_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqabs_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %a) #2
// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQABS_V1_I]]
int16x4_t test_vqabs_s16(int16x4_t a) {
  return vqabs_s16(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqabsq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %a) #2
// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
int16x8_t test_vqabsq_s16(int16x8_t a) {
  return vqabsq_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqabs_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %a) #2
// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQABS_V1_I]]
int32x2_t test_vqabs_s32(int32x2_t a) {
  return vqabs_s32(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqabsq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %a) #2
// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
int32x4_t test_vqabsq_s32(int32x4_t a) {
  return vqabsq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqabsq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a) #2
// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK: ret <2 x i64> [[VQABSQ_V1_I]]
int64x2_t test_vqabsq_s64(int64x2_t a) {
  return vqabsq_s64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s8(
// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
// CHECK: ret <8 x i8> [[VQNEG_V_I]]
int8x8_t test_vqneg_s8(int8x8_t a) {
  return vqneg_s8(a);
}
-// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s8(
// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
int8x16_t test_vqnegq_s8(int8x16_t a) {
  return vqnegq_s8(a);
}
-// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %a) #2
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
int16x4_t test_vqneg_s16(int16x4_t a) {
  return vqneg_s16(a);
}
-// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %a) #2
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
int16x8_t test_vqnegq_s16(int16x8_t a) {
  return vqnegq_s16(a);
}
-// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %a) #2
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
int32x2_t test_vqneg_s32(int32x2_t a) {
  return vqneg_s32(a);
}
-// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %a) #2
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
int32x4_t test_vqnegq_s32(int32x4_t a) {
  return vqnegq_s32(a);
}
-// CHECK-LABEL: define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> %a) #2
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK: ret <2 x i64> [[VQNEGQ_V1_I]]
int64x2_t test_vqnegq_s64(int64x2_t a) {
  return vqnegq_s64(a);
}
-// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vneg_s8(
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t
test_vneg_s8(int8x8_t a) { return vneg_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vnegq_s8( // CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a // CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vneg_s16( // CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a // CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vnegq_s16( // CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a // CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vneg_s32( // CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a // CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vnegq_s32( // CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vnegq_s64( // CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vnegq_s64(int64x2_t a) { return vnegq_s64(a); } -// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vneg_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a // CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); } -// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vnegq_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a // CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); } -// CHECK-LABEL: define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vnegq_f64( // CHECK: [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a // CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vnegq_f64(float64x2_t a) { return vnegq_f64(a); } -// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vabs_s8( // CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VABS_I]] int8x8_t test_vabs_s8(int8x8_t a) { return vabs_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vabsq_s8( // CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VABS_I]] int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vabs_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %a) #2 // CHECK: ret <4 x i16> [[VABS1_I]] 
int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vabsq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %a) #2 // CHECK: ret <8 x i16> [[VABS1_I]] int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vabs_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %a) #2 // CHECK: ret <2 x i32> [[VABS1_I]] int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vabsq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %a) #2 // CHECK: ret <4 x i32> [[VABS1_I]] int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vabsq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> %a) #2 // CHECK: ret <2 x i64> [[VABS1_I]] int64x2_t test_vabsq_s64(int64x2_t a) { return vabsq_s64(a); } -// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vabs_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x float> [[VABS1_I]] float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); } -// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vabsq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x float> [[VABS1_I]] float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); } -// CHECK-LABEL: define <2 x double> @test_vabsq_f64(<2 x double> %a) #0 { +// CHECK-LABEL: @test_vabsq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]]) #2 +// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x 
double> %a) #2 // CHECK: ret <2 x double> [[VABS1_I]] float64x2_t test_vabsq_f64(float64x2_t a) { return vabsq_f64(a); } -// CHECK-LABEL: define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vuqadd_s8( // CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2 // CHECK: ret <8 x i8> [[VUQADD_I]] int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) { return vuqadd_s8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vuqaddq_s8( // CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2 // CHECK: ret <16 x i8> [[VUQADD_I]] int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) { return vuqaddq_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vuqadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) #2 +// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #2 // CHECK: ret <4 x i16> [[VUQADD2_I]] int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) { return vuqadd_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vuqaddq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) #2 +// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #2 // CHECK: ret <8 x i16> [[VUQADD2_I]] int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) { return vuqaddq_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vuqadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) #2 +// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #2 // CHECK: ret <2 x i32> [[VUQADD2_I]] int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) { return vuqadd_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vuqaddq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) #2 +// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #2 // CHECK: ret <4 x i32> [[VUQADD2_I]] int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) { return vuqaddq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vuqaddq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) #2 +// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #2 // CHECK: ret <2 x i64> [[VUQADD2_I]] int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) { return vuqaddq_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vcls_s8( // CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VCLS_V_I]] int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vclsq_s8( // CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VCLSQ_V_I]] int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vcls_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #2 +// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) #2 // CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VCLS_V1_I]] int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vclsq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #2 +// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) #2 // CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK: ret <8 x i16> [[VCLSQ_V1_I]] int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vcls_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #2 +// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) #2 // CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VCLS_V1_I]] 
int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vclsq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #2 +// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) #2 // CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK: ret <4 x i32> [[VCLSQ_V1_I]] int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vclz_s8( // CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2 // CHECK: ret <8 x i8> [[VCLZ_V_I]] int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vclzq_s8( // CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2 // CHECK: ret <16 x i8> [[VCLZQ_V_I]] int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vclz_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2 +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #2 // CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VCLZ_V1_I]] int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vclzq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2 +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #2 // CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vclz_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2 +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #2 // CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VCLZ_V1_I]] int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vclzq_s32( // 
CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2 +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #2 // CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vclz_u8( // CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2 // CHECK: ret <8 x i8> [[VCLZ_V_I]] uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vclzq_u8( // CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2 // CHECK: ret <16 x i8> [[VCLZQ_V_I]] uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vclz_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2 +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #2 // CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VCLZ_V1_I]] uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); } -// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vclzq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2 +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #2 // CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vclz_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2 +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #2 // CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VCLZ_V1_I]] uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); } -// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vclzq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCLZQ_V1_I:%.*]] = call 
<4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2 +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #2 // CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vcnt_s8( // CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VCNT_V_I]] int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vcntq_s8( // CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VCNTQ_V_I]] int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vcnt_u8( // CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VCNT_V_I]] uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vcntq_u8( // CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VCNTQ_V_I]] uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vcnt_p8( // CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VCNT_V_I]] poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vcntq_p8( // CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VCNTQ_V_I]] poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_s8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvn_s16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <4 x i16> [[NEG_I]] int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); } -// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <8 x i16> [[NEG_I]] int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvn_s32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> // CHECK: ret <2 x i32> [[NEG_I]] int32x2_t test_vmvn_s32(int32x2_t a) { return 
vmvn_s32(a); } -// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: ret <4 x i32> [[NEG_I]] int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_u8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvn_u16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <4 x i16> [[NEG_I]] uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); } -// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <8 x i16> [[NEG_I]] uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvn_u32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> // CHECK: ret <2 x i32> [[NEG_I]] uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); } -// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: ret <4 x i32> [[NEG_I]] uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_p8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_p8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbit_s8( // CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VRBIT_I]] int8x8_t test_vrbit_s8(int8x8_t a) { return vrbit_s8(a); } -// CHECK-LABEL: define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbitq_s8( // CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VRBIT_I]] int8x16_t test_vrbitq_s8(int8x16_t a) { return vrbitq_s8(a); } -// CHECK-LABEL: define <8 x i8> @test_vrbit_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbit_u8( // CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VRBIT_I]] uint8x8_t test_vrbit_u8(uint8x8_t a) { return 
vrbit_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vrbitq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbitq_u8( // CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VRBIT_I]] uint8x16_t test_vrbitq_u8(uint8x16_t a) { return vrbitq_u8(a); } -// CHECK-LABEL: define <8 x i8> @test_vrbit_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbit_p8( // CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2 // CHECK: ret <8 x i8> [[VRBIT_I]] poly8x8_t test_vrbit_p8(poly8x8_t a) { return vrbit_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vrbitq_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vrbitq_p8( // CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2 // CHECK: ret <16 x i8> [[VRBIT_I]] poly8x16_t test_vrbitq_p8(poly8x16_t a) { return vrbitq_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[VMOVN_I]] int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[VMOVN_I]] int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vmovn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[VMOVN_I]] int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[VMOVN_I]] uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[VMOVN_I]] uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vmovn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[VMOVN_I]] 
uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmovn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) { return vmovn_high_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmovn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) { return vmovn_high_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vmovn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) { return vmovn_high_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmovn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) { return vmovn_high_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmovn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) { return vmovn_high_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmovn_high_u64(<2 x i32> %a, <2 x i64> 
%b) #0 { +// CHECK-LABEL: @test_vmovn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32> // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) { return vmovn_high_u64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqmovun_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #2 +// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %a) #2 // CHECK: ret <8 x i8> [[VQMOVUN_V1_I]] int8x8_t test_vqmovun_s16(int16x8_t a) { return vqmovun_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqmovun_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #2 +// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a) #2 // CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]] int16x4_t test_vqmovun_s32(int32x4_t a) { return vqmovun_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqmovun_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #2 +// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %a) #2 // CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]] int32x2_t test_vqmovun_s64(int64x2_t a) { return vqmovun_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqmovun_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) #2 +// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %b) #2 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) { return vqmovun_high_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqmovun_high_s32( // 
CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) #2 +// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %b) #2 // CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) { return vqmovun_high_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqmovun_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) #2 +// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %b) #2 // CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) { return vqmovun_high_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqmovn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %a) #2 // CHECK: ret <8 x i8> [[VQMOVN_V1_I]] int8x8_t test_vqmovn_s16(int16x8_t a) { return vqmovn_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqmovn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a) #2 // CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VQMOVN_V1_I]] int16x4_t test_vqmovn_s32(int32x4_t a) { return vqmovn_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqmovn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x 
i64> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %a) #2 // CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VQMOVN_V1_I]] int32x2_t test_vqmovn_s64(int64x2_t a) { return vqmovn_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %b) #2 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) { return vqmovn_high_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %b) #2 // CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) { return vqmovn_high_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %b) #2 // CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) { return vqmovn_high_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqmovn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: 
[[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a) #2 // CHECK: ret <8 x i8> [[VQMOVN_V1_I]] uint8x8_t test_vqmovn_u16(uint16x8_t a) { return vqmovn_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqmovn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %a) #2 // CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VQMOVN_V1_I]] uint16x4_t test_vqmovn_u32(uint32x4_t a) { return vqmovn_u32(a); } -// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vqmovn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %a) #2 // CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VQMOVN_V1_I]] uint32x2_t test_vqmovn_u64(uint64x2_t a) { return vqmovn_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %b) #2 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { return vqmovn_high_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %b) #2 // CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i16> [[SHUFFLE_I_I]] 
uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { return vqmovn_high_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqmovn_high_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %b) #2 // CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { return vqmovn_high_u64(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s8( // CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -2242,7 +2050,7 @@ int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 8); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> @@ -2252,7 +2060,7 @@ int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 16); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> @@ -2262,7 +2070,7 @@ int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 32); } -// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u8( // CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -2270,7 +2078,7 @@ uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 8); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> @@ -2280,7 +2088,7 @@ uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 16); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> @@ -2290,7 +2098,7 @@ uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 32); } -// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_high_n_s8( // 
-// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -2299,7 +2107,7 @@ int16x8_t test_vshll_high_n_s8(int8x16_t a) {
  return vshll_high_n_s8(a, 8);
}

-// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -2310,7 +2118,7 @@ int32x4_t test_vshll_high_n_s16(int16x8_t a) {
  return vshll_high_n_s16(a, 16);
}

-// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -2321,7 +2129,7 @@ int64x2_t test_vshll_high_n_s32(int32x4_t a) {
  return vshll_high_n_s32(a, 32);
}

-// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -2330,7 +2138,7 @@ uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
  return vshll_high_n_u8(a, 8);
}

-// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -2341,7 +2149,7 @@ uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
  return vshll_high_n_u16(a, 16);
}

-// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshll_high_n_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -2352,10 +2160,9 @@ uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
  return vshll_high_n_u32(a, 32);
}

-// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #2
+// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) #2
// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK: ret <4 x half> [[TMP1]]
@@ -2363,10 +2170,9 @@ float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}

-// CHECK-LABEL: define <8 x half> @test_vcvt_high_f16_f32(<4 x half> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vcvt_high_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) #2
+// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %b) #2
// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2375,693 +2181,617 @@ float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) {
  return vcvt_high_f16_f32(a, b);
}

-// CHECK-LABEL: define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_f64(float64x2_t a) {
  return vcvt_f32_f64(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vcvt_high_f32_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> %b to <2 x float>
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) {
  return vcvt_high_f32_f64(a, b);
}

-// CHECK-LABEL: define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtx_f32_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) #2
+// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]]
float32x2_t test_vcvtx_f32_f64(float64x2_t a) {
  return vcvtx_f32_f64(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vcvtx_high_f32_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) #2
+// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #2
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
  return vcvtx_high_f32_f64(a, b);
}

-// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2
// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP1]]
+// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vcvt_high_f32_f16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2
// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP1]]
+// CHECK: ret <4 x float> [[VCVT_F32_F161_I_I]]
float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
  return vcvt_high_f32_f16(a);
}

-// CHECK-LABEL: define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> %a to <2 x double>
// CHECK: ret <2 x double> [[VCVT_I]]
float64x2_t test_vcvt_f64_f32(float32x2_t a) {
  return vcvt_f64_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_high_f64_f32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I_I]] to <2 x double>
// CHECK: ret <2 x double> [[VCVT_I_I]]
float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
  return vcvt_high_f64_f32(a);
}

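The f32/f64 conversions above now feed %a (or %b) straight into fptrunc/fpext, with the vcvt_high_* forms adding one shufflevector to merge or extract the high half. A small usage sketch under the same arm_neon.h assumption (narrow_two is an illustrative name):

#include <arm_neon.h>

// Narrow two doubles to floats and append them to an existing float pair;
// per the checks above this lowers to fptrunc plus a single shufflevector.
float32x4_t narrow_two(float32x2_t lo, float64x2_t hi) {
  return vcvt_high_f32_f64(lo, hi);
}
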
-// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndn_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> [[VRNDN_I]]) #2
+// CHECK: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDN1_I]]
float32x2_t test_vrndn_f32(float32x2_t a) {
  return vrndn_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndnq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> [[VRNDN_I]]) #2
+// CHECK: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDN1_I]]
float32x4_t test_vrndnq_f32(float32x4_t a) {
  return vrndnq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndnq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> [[VRNDN_I]]) #2
+// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDN1_I]]
float64x2_t test_vrndnq_f64(float64x2_t a) {
  return vrndnq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrnda_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) #2
+// CHECK: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDA1_I]]
float32x2_t test_vrnda_f32(float32x2_t a) {
  return vrnda_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndaq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) #2
+// CHECK: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDA1_I]]
float32x4_t test_vrndaq_f32(float32x4_t a) {
  return vrndaq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndaq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) #2
+// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDA1_I]]
float64x2_t test_vrndaq_f64(float64x2_t a) {
  return vrndaq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndp_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) #2
+// CHECK: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDP1_I]]
float32x2_t test_vrndp_f32(float32x2_t a) {
  return vrndp_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndpq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) #2
+// CHECK: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDP1_I]]
float32x4_t test_vrndpq_f32(float32x4_t a) {
  return vrndpq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndpq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) #2
+// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDP1_I]]
float64x2_t test_vrndpq_f64(float64x2_t a) {
  return vrndpq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndm_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) #2
+// CHECK: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDM1_I]]
float32x2_t test_vrndm_f32(float32x2_t a) {
  return vrndm_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndmq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) #2
+// CHECK: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDM1_I]]
float32x4_t test_vrndmq_f32(float32x4_t a) {
  return vrndmq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) #2
+// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDM1_I]]
float64x2_t test_vrndmq_f64(float64x2_t a) {
  return vrndmq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndx_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) #2
+// CHECK: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDX1_I]]
float32x2_t test_vrndx_f32(float32x2_t a) {
  return vrndx_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndxq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) #2
+// CHECK: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDX1_I]]
float32x4_t test_vrndxq_f32(float32x4_t a) {
  return vrndxq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndxq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) #2
+// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDX1_I]]
float64x2_t test_vrndxq_f64(float64x2_t a) {
  return vrndxq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrnd_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) #2
+// CHECK: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDZ1_I]]
float32x2_t test_vrnd_f32(float32x2_t a) {
  return vrnd_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) #2
+// CHECK: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDZ1_I]]
float32x4_t test_vrndq_f32(float32x4_t a) {
  return vrndq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) #2
+// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDZ1_I]]
float64x2_t test_vrndq_f64(float64x2_t a) {
  return vrndq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndi_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I]]) #2
+// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRNDI1_I]]
float32x2_t test_vrndi_f32(float32x2_t a) {
  return vrndi_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrndiq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I]]) #2
+// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRNDI1_I]]
float32x4_t test_vrndiq_f32(float32x4_t a) {
  return vrndiq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrndiq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDI_I]]) #2
+// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRNDI1_I]]
float64x2_t test_vrndiq_f64(float64x2_t a) {
  return vrndiq_f64(a);
}

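All the vrnd* checks above reduce to a single intrinsic call on %a. Collected from the checks themselves: vrnda -> @llvm.round, vrndp -> @llvm.ceil, vrndm -> @llvm.floor, vrndx -> @llvm.rint, vrndi -> @llvm.nearbyint, vrnd -> @llvm.trunc, and vrndn -> @llvm.aarch64.neon.frintn (round-to-nearest-even, which has no generic LLVM intrinsic in this lowering). A tiny usage sketch, assuming only arm_neon.h (round_away is an illustrative name):

#include <arm_neon.h>

// Round each lane to the nearest integer, ties away from zero;
// per the checks above this becomes a single @llvm.round.v4f32 call.
float32x4_t round_away(float32x4_t v) {
  return vrndaq_f32(v);
}
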
-// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptosi <2 x float> %a to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptosi <4 x float> %a to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptosi <2 x double> %a to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
  return vcvtq_s64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptoui <2 x float> %a to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptoui <4 x float> %a to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP2:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = fptoui <2 x double> %a to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
  return vcvtq_u64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtn_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTN1_I]]
int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
  return vcvtn_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtnq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTN1_I]]
int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
  return vcvtnq_s32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtnq_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTN1_I]]
int64x2_t test_vcvtnq_s64_f64(float64x2_t a) {
  return vcvtnq_s64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtn_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTN1_I]]
uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
  return vcvtn_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtnq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTN1_I]]
uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
  return vcvtnq_u32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtnq_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTN1_I]]
uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) {
  return vcvtnq_u64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtp_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTP1_I]]
int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
  return vcvtp_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtpq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTP1_I]]
int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
  return vcvtpq_s32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtpq_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTP1_I]]
int64x2_t test_vcvtpq_s64_f64(float64x2_t a) {
  return vcvtpq_s64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtp_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTP1_I]]
uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
  return vcvtp_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtpq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTP1_I]]
uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
  return vcvtpq_u32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtpq_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTP1_I]]
uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) {
  return vcvtpq_u64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtm_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTM1_I]]
int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
  return vcvtm_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtmq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTM1_I]]
int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
  return vcvtmq_s32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtmq_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTM1_I]]
int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
  return vcvtmq_s64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtm_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTM1_I]]
uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
  return vcvtm_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtmq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTM1_I]]
uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
  return vcvtmq_u32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtmq_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTM1_I]]
uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
  return vcvtmq_u64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvta_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTA1_I]]
int32x2_t test_vcvta_s32_f32(float32x2_t a) {
  return vcvta_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtaq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTA1_I]]
int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
  return vcvtaq_s32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtaq_s64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTA1_I]]
int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
  return vcvtaq_s64_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvta_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x i32> [[VCVTA1_I]]
uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
  return vcvta_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtaq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x i32> [[VCVTA1_I]]
uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
  return vcvtaq_u32_f32(a);
}

-// CHECK-LABEL: define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vcvtaq_u64_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x i64> [[VCVTA1_I]]
uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
  return vcvtaq_u64_f64(a);
}

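The vcvt{n,p,m,a}* checks above pair each explicit rounding mode with a dedicated AArch64 intrinsic: fcvtns/fcvtnu (to nearest, ties even), fcvtps/fcvtpu (toward +infinity), fcvtms/fcvtmu (toward -infinity), and fcvtas/fcvtau (to nearest, ties away), while the plain vcvt_* forms earlier lower to ordinary fptosi/fptoui (toward zero). A short sketch under the same header assumption (to_int_nearest is a made-up name):

#include <arm_neon.h>

// Convert float lanes to signed ints rounding to nearest (ties to even);
// per the checks above this is one @llvm.aarch64.neon.fcvtns.v4i32.v4f32 call.
int32x4_t to_int_nearest(float32x4_t v) {
  return vcvtnq_s32_f32(v);
}
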
-// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrsqrte_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #2
+// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
float32x2_t test_vrsqrte_f32(float32x2_t a) {
  return vrsqrte_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrsqrteq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #2
+// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
  return vrsqrteq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrsqrteq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]]) #2
+// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]]
float64x2_t test_vrsqrteq_f64(float64x2_t a) {
  return vrsqrteq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrecpe_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #2
+// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VRECPE_V1_I]]
float32x2_t test_vrecpe_f32(float32x2_t a) {
  return vrecpe_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrecpeq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #2
+// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
float32x4_t test_vrecpeq_f32(float32x4_t a) {
  return vrecpeq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vrecpeq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]]) #2
+// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VRECPEQ_V1_I]]
float64x2_t test_vrecpeq_f64(float64x2_t a) {
  return vrecpeq_f64(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrecpe_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #2
+// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %a) #2
// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
  return vrecpe_u32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrecpeq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #2
+// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %a) #2
// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
  return vrecpeq_u32(a);
}

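vrecpe*/vrsqrte* above are the hardware estimate instructions (frecpe, frsqrte, and urecpe for the integer form), so they produce approximations rather than exact results. A hedged sketch of the usual refinement idiom, assuming arm_neon.h also provides the vrsqrts step intrinsic as in released AArch64 headers (rsqrt_refined is an illustrative name, not part of this test):

#include <arm_neon.h>

// One Newton-Raphson step on top of the reciprocal-square-root estimate:
// x' = x * (3 - v*x*x) / 2, where vrsqrtsq_f32(a, b) computes (3 - a*b) / 2.
float32x4_t rsqrt_refined(float32x4_t v) {
  float32x4_t x = vrsqrteq_f32(v);                        // initial estimate
  return vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(v, x), x));  // refined estimate
}
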
-// CHECK-LABEL: define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vsqrt_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP1]]) #2
+// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
// CHECK: ret <2 x float> [[VSQRT_I]]
float32x2_t test_vsqrt_f32(float32x2_t a) {
  return vsqrt_f32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vsqrtq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
// CHECK: ret <4 x float> [[VSQRT_I]]
float32x4_t test_vsqrtq_f32(float32x4_t a) {
  return vsqrtq_f32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+// CHECK-LABEL: @test_vsqrtq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) #2
+// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
// CHECK: ret <2 x double> [[VSQRT_I]]
float64x2_t test_vsqrtq_f64(float64x2_t a) {
  return vsqrtq_f64(a);
}

-// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

-// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}

-// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f64_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> %a to <2 x double>
// CHECK: ret <2 x double> [[VCVT_I]]
float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
  return vcvtq_f64_s64(a);
}

-// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f64_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> %a to <2 x double>
// CHECK: ret <2 x double> [[VCVT_I]]
float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
  return vcvtq_f64_u64(a);
diff --git a/test/CodeGen/aarch64-neon-perm.c b/test/CodeGen/aarch64-neon-perm.c
index ca9f15d22c67..5b8a99c993fe 100644
--- a/test/CodeGen/aarch64-neon-perm.c
+++ b/test/CodeGen/aarch64-neon-perm.c
@@ -4,889 +4,889 @@
// Test new aarch64 intrinsics and types
#include <arm_neon.h>

-// CHECK-LABEL: define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
  return vuzp1_s8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
  return vuzp1q_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
  return vuzp1_s16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
  return vuzp1q_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
  return vuzp1_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
  return vuzp1q_s32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_s64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
  return vuzp1q_s64(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp1_u8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
  return vuzp1q_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp1_u16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
  return vuzp1q_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp1_u32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
  return vuzp1q_u32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_u64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
  return vuzp1q_u64(a, b);
}

-// CHECK-LABEL: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
  return vuzp1_f32(a, b);
}

-// CHECK-LABEL: define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
  return vuzp1q_f32(a, b);
}

-// CHECK-LABEL: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_f64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x double> [[SHUFFLE_I]]
float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
  return vuzp1q_f64(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp1_p8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
  return vuzp1q_p8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp1_p16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp1q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
  return vuzp1q_p16(a, b);
}

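Every vuzp1/vuzp2 check in this file is a single shufflevector: vuzp1 gathers the even-indexed lanes of the concatenated pair a:b, and vuzp2 (next) gathers the odd-indexed ones, while the vzip1 tests further down interleave the low halves. A small demonstration with the same arm_neon.h assumption (split_even_odd is a made-up name):

#include <arm_neon.h>

// De-interleave a stream of packed pairs: even lanes of a:b in the first
// result, odd lanes in the second; each call is one shufflevector per the
// checks above and below.
void split_even_odd(uint8x8_t a, uint8x8_t b, uint8x8_t *even, uint8x8_t *odd) {
  *even = vuzp1_u8(a, b);  // lanes 0, 2, 4, ... of a:b
  *odd  = vuzp2_u8(a, b);  // lanes 1, 3, 5, ... of a:b
}
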
-// CHECK-LABEL: define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
  return vuzp2_s8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
  return vuzp2q_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
  return vuzp2_s16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
  return vuzp2q_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
  return vuzp2_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
  return vuzp2q_s32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_s64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
  return vuzp2q_s64(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp2_u8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
  return vuzp2q_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp2_u16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
  return vuzp2q_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp2_u32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
  return vuzp2q_u32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_u64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
  return vuzp2q_u64(a, b);
}

-// CHECK-LABEL: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
  return vuzp2_f32(a, b);
}

-// CHECK-LABEL: define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
  return vuzp2q_f32(a, b);
}

-// CHECK-LABEL: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_f64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x double> [[SHUFFLE_I]]
float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
  return vuzp2q_f64(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp2_p8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
  return vuzp2q_p8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp2_p16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp2q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
  return vuzp2q_p16(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vzip1_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
  return vzip1_s8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vzip1q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
  return vzip1q_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vzip1_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
  return vzip1_s16(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vzip1q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2,
i32 10, i32 3, i32 11> // CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) { return vzip1q_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip1_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) { return vzip1_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip1q_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> // CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) { return vzip1q_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vzip1q_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) { return vzip1q_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip1_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) { return vzip1_u8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip1q_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> // CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) { return vzip1q_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip1_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) { return vzip1_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip1q_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> // CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) { return vzip1q_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip1_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) { return vzip1_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip1q_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> // CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) { return vzip1q_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vzip1q_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> 
%a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) { return vzip1q_u64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vzip1_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) { return vzip1_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vzip1q_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> // CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) { return vzip1q_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK-LABEL: @test_vzip1q_f64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) { return vzip1q_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip1_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) { return vzip1_p8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip1q_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> // CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) { return vzip1q_p8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip1_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> // CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) { return vzip1_p16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip1q_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> // CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) { return vzip1q_p16(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) { return vzip2_s8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2q_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t 
b) { return vzip2q_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) { return vzip2_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2q_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) { return vzip2q_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip2_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) { return vzip2_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip2q_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) { return vzip2q_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vzip2q_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) { return vzip2q_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) { return vzip2_u8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2q_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) { return vzip2q_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) { return vzip2_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2q_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) { return vzip2q_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip2_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x 
i32> [[SHUFFLE_I]] uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) { return vzip2_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip2q_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) { return vzip2q_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vzip2q_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) { return vzip2q_u64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vzip2_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) { return vzip2_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vzip2q_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) { return vzip2q_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK-LABEL: @test_vzip2q_f64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) { return vzip2q_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) { return vzip2_p8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip2q_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) { return vzip2q_p8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) { return vzip2_p16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip2q_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) { return vzip2q_p16(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1_s8( // CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { return vtrn1_s8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> // CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { return vtrn1q_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { return vtrn1_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { return vtrn1q_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn1_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { return vtrn1_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { return vtrn1q_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { return vtrn1q_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { return vtrn1_u8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> // CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { return vtrn1q_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { return vtrn1_u16(a, 
b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { return vtrn1q_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn1_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { return vtrn1_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { return vtrn1q_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { return vtrn1q_u64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vtrn1_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { return vtrn1_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { return vtrn1q_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_f64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> // CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { return vtrn1q_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { return vtrn1_p8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> // CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { return vtrn1q_p8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: ret <4 x i16> 
[[SHUFFLE_I]] poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { return vtrn1_p16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn1q_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { return vtrn1q_p16(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn2_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { return vtrn2_s8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { return vtrn2q_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { return vtrn2_s16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { return vtrn2q_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn2_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { return vtrn2_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { return vtrn2q_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { return vtrn2q_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn2_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { return vtrn2_u8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 
x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { return vtrn2q_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { return vtrn2_u16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { return vtrn2q_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn2_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { return vtrn2_u32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { return vtrn2q_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { return vtrn2q_u64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vtrn2_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { return vtrn2_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { return vtrn2q_f32(a, b); } -// CHECK-LABEL: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_f64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> // CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { return vtrn2q_f64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vtrn2_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { return vtrn2_p8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) 
#0 { +// CHECK-LABEL: @test_vtrn2q_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> // CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { return vtrn2q_p8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { return vtrn2_p16(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vtrn2q_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> // CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { return vtrn2q_p16(a, b); } -// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vuzp_s8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 @@ -910,7 +910,7 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); } -// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vuzp_s16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 @@ -918,26 +918,25 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> // CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr 
inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP10]] int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); } -// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 { + +// CHECK-LABEL: @test_vuzp_s32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 @@ -945,26 +944,25 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load 
%struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP10]] int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); } -// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 { + +// CHECK-LABEL: @test_vuzp_u8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 @@ -987,7 +985,8 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); } -// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 { + +// CHECK-LABEL: @test_vuzp_u16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 @@ -995,26 +994,25 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> // CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 
+// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP10]] uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); } -// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 { + +// CHECK-LABEL: @test_vuzp_u32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 @@ -1022,26 +1020,25 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> // CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret 
%struct.uint32x2x2_t [[TMP10]] uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); } -// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 { + +// CHECK-LABEL: @test_vuzp_f32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 @@ -1049,26 +1046,25 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> // CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 -// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 +// CHECK: store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP10]] float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); } -// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 { + +// CHECK-LABEL: @test_vuzp_p8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[RETVAL:%.*]] = 
alloca %struct.poly8x8x2_t, align 8 @@ -1091,7 +1087,8 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); } -// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 { + +// CHECK-LABEL: @test_vuzp_p16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 @@ -1099,26 +1096,25 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> // CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP10]] poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); } -// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 { + +// CHECK-LABEL: @test_vuzpq_s8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca 
%struct.int8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
@@ -1141,7 +1137,8 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}
-// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
@@ -1149,26 +1146,25 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP10]]
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}
-// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
@@ -1176,26 +1172,25 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP10]]
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}
-// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
@@ -1218,7 +1213,8 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}
-// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
@@ -1226,26 +1222,25 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}
-// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
@@ -1253,26 +1248,25 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}
-// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
@@ -1280,26 +1274,25 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP10]]
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}
-// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
@@ -1322,7 +1315,8 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}
-// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vuzpq_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
@@ -1330,27 +1324,25 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}
-// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vzip_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
@@ -1374,7 +1366,7 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
  return vzip_s8(a, b);
}
-// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vzip_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
@@ -1382,26 +1374,25 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP10]]
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
  return vzip_s16(a, b);
}
-// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
@@ -1409,26 +1400,25 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP10]]
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
  return vzip_s32(a, b);
}
-// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
@@ -1451,7 +1441,8 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
  return vzip_u8(a, b);
}
-// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
@@ -1459,26 +1450,25 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
  return vzip_u16(a, b);
}
-// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
@@ -1486,26 +1476,25 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
  return vzip_u32(a, b);
}
-// CHECK-LABEL: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
@@ -1513,26 +1502,25 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
-// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
+// CHECK: store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP10]]
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
  return vzip_f32(a, b);
}
-// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
@@ -1555,7 +1543,8 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
  return vzip_p8(a, b);
}
-// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vzip_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
@@ -1563,26 +1552,25 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
  return vzip_p16(a, b);
}
-// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
@@ -1605,7 +1593,8 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
  return vzipq_s8(a, b);
}
-// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
@@ -1613,26 +1602,25 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP10]]
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
  return vzipq_s16(a, b);
}
-// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
@@ -1640,26 +1628,25 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP10]]
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
  return vzipq_s32(a, b);
}
-// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
@@ -1682,7 +1669,8 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
  return vzipq_u8(a, b);
}
-// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
@@ -1690,26 +1678,25 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
  return vzipq_u16(a, b);
}
-// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
@@ -1717,26 +1704,25 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
  return vzipq_u32(a, b);
}
-// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
@@ -1744,26 +1730,25 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP10]]
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
  return vzipq_f32(a, b);
}
-// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
@@ -1786,7 +1771,8 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
  return vzipq_p8(a, b);
}
-// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vzipq_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
@@ -1794,27 +1780,25 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
-// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}
-// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrn_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
@@ -1838,7 +1822,7 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}
-// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrn_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
@@ -1846,26 +1830,25 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP10]]
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}
-// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vtrn_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
@@ -1873,26 +1856,25 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP10]]
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}
-// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+
+// CHECK-LABEL: @test_vtrn_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
@@ -1915,7 +1897,8 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}
-// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+
+// CHECK-LABEL: @test_vtrn_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
@@ -1923,26 +1906,25 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}
-// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+
+// CHECK-LABEL: @test_vtrn_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
@@ -1950,26 +1932,25 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
-// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
-// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
-// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP12]]
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}
-//
CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 { + +// CHECK-LABEL: @test_vtrn_f32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 @@ -1977,26 +1958,25 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> // CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 -// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 +// CHECK: store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP10]] float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 { + +// CHECK-LABEL: @test_vtrn_p8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 @@ -2019,7 +1999,8 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) 
{ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 { + +// CHECK-LABEL: @test_vtrn_p16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 @@ -2027,26 +2008,25 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> -// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 -// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8 +// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP10]] poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_s8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 @@ -2069,7 +2049,8 @@ 
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_s16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 @@ -2077,26 +2058,25 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> -// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP10]] int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); } -// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_s32( // CHECK: [[RETVAL_I:%.*]] = alloca 
%struct.int32x4x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 @@ -2104,26 +2084,25 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> -// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP10]] int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); } -// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_u8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 @@ -2146,7 +2125,8 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); } -// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 { + +// 
CHECK-LABEL: @test_vtrnq_u16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 @@ -2154,26 +2134,25 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> -// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP10]] uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); } -// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_u32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 @@ -2181,26 +2160,25 @@ uint16x8x2_t 
test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> -// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP10]] uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); } -// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_f32( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 @@ -2208,26 +2186,25 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[VTRN_I:%.*]] = 
shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> // CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> -// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 -// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0 +// CHECK: store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP10]] float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); } -// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_p8( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 @@ -2250,7 +2227,8 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); } -// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 { + +// CHECK-LABEL: @test_vtrnq_p16( // CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 @@ -2258,22 +2236,20 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// 
CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> // CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> -// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 -// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 -// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 -// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 -// CHECK: ret %struct.poly16x8x2_t [[TMP12]] +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]] +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16 +// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP10]] poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { return vtrnq_p16(a, b); } diff --git a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c index 39aab2540e8a..ac5a090fd2e6 100644 --- a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c +++ b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c @@ -47,9 +47,7 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) { } // CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double // CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b // CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double> // CHECK: ret <1 x double> [[TMP4]] diff --git 
a/test/CodeGen/aarch64-poly64.c b/test/CodeGen/aarch64-poly64.c index 762ca94e5e57..eadeda640434 100644 --- a/test/CodeGen/aarch64-poly64.c +++ b/test/CodeGen/aarch64-poly64.c @@ -23,11 +23,7 @@ uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) { } // CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP4:%.*]] = and <1 x i64> %a, %b // CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> // CHECK: ret <1 x i64> [[VTST_I]] @@ -36,11 +32,7 @@ uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) { } // CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP4:%.*]] = and <2 x i64> %a, %b // CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer // CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> // CHECK: ret <2 x i64> [[VTST_I]] @@ -49,15 +41,9 @@ uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) { } // CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %a, %b +// CHECK: [[TMP3:%.*]] = xor <1 x i64> %a, <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %c // CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <1 x i64> [[VBSL5_I]] poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) { @@ -65,15 +51,9 @@ poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) { } // CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> -// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %a, %b +// CHECK: [[TMP3:%.*]] = xor <2 x i64> %a, <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %c 
// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] // CHECK: ret <2 x i64> [[VBSL5_I]] poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) { diff --git a/test/CodeGen/address-safety-attr-kasan.cpp b/test/CodeGen/address-safety-attr-kasan.cpp index c84ba88291da..4d8333d2ffea 100644 --- a/test/CodeGen/address-safety-attr-kasan.cpp +++ b/test/CodeGen/address-safety-attr-kasan.cpp @@ -8,31 +8,31 @@ int HasSanitizeAddress() { return 1; } -// CHECK-NOASAN: {{Function Attrs: nounwind$}} -// CHECK-ASAN: Function Attrs: nounwind sanitize_address -// CHECK-KASAN: Function Attrs: nounwind sanitize_address +// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address +// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address __attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() { return 0; } -// CHECK-NOASAN: {{Function Attrs: nounwind$}} -// CHECK-ASAN: {{Function Attrs: nounwind$}} -// CHECK-KASAN: {{Function Attrs: nounwind sanitize_address$}} +// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-ASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}} __attribute__((no_sanitize_address)) int NoSanitizeAddress() { return 0; } -// CHECK-NOASAN: {{Function Attrs: nounwind$}} -// CHECK-ASAN: {{Function Attrs: nounwind$}} -// CHECK-KASAN: {{Function Attrs: nounwind sanitize_address$}} +// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-ASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}} __attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() { return 0; } -// CHECK-NOASAN: {{Function Attrs: nounwind$}} -// CHECK-ASAN: {{Function Attrs: nounwind sanitize_address$}} -// CHECK-KASAN: {{Function Attrs: nounwind$}} +// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}} +// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}} +// CHECK-KASAN: {{Function Attrs: noinline nounwind$}} diff --git a/test/CodeGen/address-safety-attr.cpp b/test/CodeGen/address-safety-attr.cpp index 402d6bad909d..60faeddbd4ce 100644 --- a/test/CodeGen/address-safety-attr.cpp +++ b/test/CodeGen/address-safety-attr.cpp @@ -9,12 +9,11 @@ int DefinedInDifferentFile(int *a); // RUN: echo "fun:*BlacklistedFunction*" > %t.func.blacklist // RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin -emit-llvm -o - %s -include %t.extra-source.cpp -fsanitize=address -fsanitize-blacklist=%t.func.blacklist | FileCheck -check-prefix=BLFUNC %s -// RUN: echo "src:%s" > %t.file.blacklist +// The blacklist file uses regexps, so escape backslashes, which are common in +// Windows paths. +// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t.file.blacklist // RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin -emit-llvm -o - %s -include %t.extra-source.cpp -fsanitize=address -fsanitize-blacklist=%t.file.blacklist | FileCheck -check-prefix=BLFILE %s -// FIXME: %t.file.blacklist is like "src:x:\path\to\clang\test\CodeGen\address-safety-attr.cpp" -// REQUIRES: shell - // The sanitize_address attribute should be attached to functions // when AddressSanitizer is enabled, unless no_sanitize_address attribute // is present. 
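// A minimal sketch of the convention these hunks check (hypothetical
// functions, not part of the test): at -O0 definitions now carry noinline
// in addition to nounwind, and -fsanitize=address adds sanitize_address
// unless the function opts out:
//
//   int instrumented(int *p) { return *p; }  // attrs: { noinline nounwind sanitize_address }
//   __attribute__((no_sanitize_address))
//   int opted_out(int *p) { return *p; }     // attrs: { noinline nounwind }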
@@ -144,13 +143,13 @@ int global2 = *(int*)((char*)&global1+1); // BLFUNC: @__cxx_global_var_init{{.*}}[[WITH]] // ASAN: @__cxx_global_var_init{{.*}}[[WITH]] -// WITHOUT: attributes [[NOATTR]] = { nounwind{{.*}} } +// WITHOUT: attributes [[NOATTR]] = { noinline nounwind{{.*}} } -// BLFILE: attributes [[WITH]] = { nounwind sanitize_address{{.*}} } -// BLFILE: attributes [[NOATTR]] = { nounwind{{.*}} } +// BLFILE: attributes [[WITH]] = { noinline nounwind sanitize_address{{.*}} } +// BLFILE: attributes [[NOATTR]] = { noinline nounwind{{.*}} } -// BLFUNC: attributes [[WITH]] = { nounwind sanitize_address{{.*}} } -// BLFUNC: attributes [[NOATTR]] = { nounwind{{.*}} } +// BLFUNC: attributes [[WITH]] = { noinline nounwind sanitize_address{{.*}} } +// BLFUNC: attributes [[NOATTR]] = { noinline nounwind{{.*}} } -// ASAN: attributes [[WITH]] = { nounwind sanitize_address{{.*}} } -// ASAN: attributes [[NOATTR]] = { nounwind{{.*}} } +// ASAN: attributes [[WITH]] = { noinline nounwind sanitize_address{{.*}} } +// ASAN: attributes [[NOATTR]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/address-space-field1.c b/test/CodeGen/address-space-field1.c index 109c69201cb2..64b51aefc9a1 100644 --- a/test/CodeGen/address-space-field1.c +++ b/test/CodeGen/address-space-field1.c @@ -37,4 +37,4 @@ void test_addrspace(__addr1 S* p1, __addr2 S*p2) { p1->b = p2->a; } -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/alias.c b/test/CodeGen/alias.c index c34dcf5ca229..6ec12702d86b 100644 --- a/test/CodeGen/alias.c +++ b/test/CodeGen/alias.c @@ -77,9 +77,9 @@ int outer_weak(int a) { return inner_weak_a(a); } // CHECKCC: call arm_aapcs_vfpcc i32 @inner_weak(i32 %{{.*}}) // CHECKCC: define internal arm_aapcs_vfpcc i32 @inner_weak(i32 %a) [[NUW]] { -// CHECKBASIC: attributes [[NUW]] = { nounwind{{.*}} } +// CHECKBASIC: attributes [[NUW]] = { noinline nounwind{{.*}} } -// CHECKCC: attributes [[NUW]] = { nounwind{{.*}} } +// CHECKCC: attributes [[NUW]] = { noinline nounwind{{.*}} } void test8_bar() {} void test8_foo() __attribute__((weak, alias("test8_bar"))); diff --git a/test/CodeGen/alloc-size.c b/test/CodeGen/alloc-size.c new file mode 100644 index 000000000000..1e503f0579c9 --- /dev/null +++ b/test/CodeGen/alloc-size.c @@ -0,0 +1,352 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - 2>&1 | FileCheck %s + +#define NULL ((void *)0) + +int gi; + +typedef unsigned long size_t; + +// CHECK-DAG-RE: define void @my_malloc({{.*}}) #[[MALLOC_ATTR_NUMBER:[0-9]+]] +// N.B. 
LLVM's allocsize arguments are base-0, whereas ours are base-1 (for +// compat with GCC) +// CHECK-DAG-RE: attributes #[[MALLOC_ATTR_NUMBER]] = {.*allocsize(0).*} +void *my_malloc(size_t) __attribute__((alloc_size(1))); + +// CHECK-DAG-RE: define void @my_calloc({{.*}}) #[[CALLOC_ATTR_NUMBER:[0-9]+]] +// CHECK-DAG-RE: attributes #[[CALLOC_ATTR_NUMBER]] = {.*allocsize(0, 1).*} +void *my_calloc(size_t, size_t) __attribute__((alloc_size(1, 2))); + +// CHECK-LABEL: @test1 +void test1() { + void *const vp = my_malloc(100); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 0); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 1); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 2); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 3); + + void *const arr = my_calloc(100, 5); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 0); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 1); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 2); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 3); + + // CHECK: store i32 100 + gi = __builtin_object_size(my_malloc(100), 0); + // CHECK: store i32 100 + gi = __builtin_object_size(my_malloc(100), 1); + // CHECK: store i32 100 + gi = __builtin_object_size(my_malloc(100), 2); + // CHECK: store i32 100 + gi = __builtin_object_size(my_malloc(100), 3); + + // CHECK: store i32 500 + gi = __builtin_object_size(my_calloc(100, 5), 0); + // CHECK: store i32 500 + gi = __builtin_object_size(my_calloc(100, 5), 1); + // CHECK: store i32 500 + gi = __builtin_object_size(my_calloc(100, 5), 2); + // CHECK: store i32 500 + gi = __builtin_object_size(my_calloc(100, 5), 3); + + void *const zeroPtr = my_malloc(0); + // CHECK: store i32 0 + gi = __builtin_object_size(zeroPtr, 0); + // CHECK: store i32 0 + gi = __builtin_object_size(my_malloc(0), 0); + + void *const zeroArr1 = my_calloc(0, 1); + void *const zeroArr2 = my_calloc(1, 0); + // CHECK: store i32 0 + gi = __builtin_object_size(zeroArr1, 0); + // CHECK: store i32 0 + gi = __builtin_object_size(zeroArr2, 0); + // CHECK: store i32 0 + gi = __builtin_object_size(my_calloc(1, 0), 0); + // CHECK: store i32 0 + gi = __builtin_object_size(my_calloc(0, 1), 0); +} + +// CHECK-LABEL: @test2 +void test2() { + void *const vp = my_malloc(gi); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(vp, 0); + + void *const arr1 = my_calloc(gi, 1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr1, 0); + + void *const arr2 = my_calloc(1, gi); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr2, 0); +} + +// CHECK-LABEL: @test3 +void test3() { + char *const buf = (char *)my_calloc(100, 5); + // CHECK: store i32 500 + gi = __builtin_object_size(buf, 0); + // CHECK: store i32 500 + gi = __builtin_object_size(buf, 1); + // CHECK: store i32 500 + gi = __builtin_object_size(buf, 2); + // CHECK: store i32 500 + gi = __builtin_object_size(buf, 3); +} + +struct Data { + int a; + int t[10]; + char pad[3]; + char end[1]; +}; + +// CHECK-LABEL: @test5 +void test5() { + struct Data *const data = my_malloc(sizeof(*data)); + // CHECK: store i32 48 + gi = __builtin_object_size(data, 0); + // CHECK: store i32 48 + gi = __builtin_object_size(data, 1); + // CHECK: store i32 48 + gi = __builtin_object_size(data, 2); + // CHECK: store i32 48 + gi = __builtin_object_size(data, 3); + + // CHECK: store i32 40 + gi = __builtin_object_size(&data->t[1], 0); + // CHECK: store i32 36 + gi = __builtin_object_size(&data->t[1], 1); + // CHECK: store i32 40 + gi = 
__builtin_object_size(&data->t[1], 2); + // CHECK: store i32 36 + gi = __builtin_object_size(&data->t[1], 3); + + struct Data *const arr = my_calloc(sizeof(*data), 2); + // CHECK: store i32 96 + gi = __builtin_object_size(arr, 0); + // CHECK: store i32 96 + gi = __builtin_object_size(arr, 1); + // CHECK: store i32 96 + gi = __builtin_object_size(arr, 2); + // CHECK: store i32 96 + gi = __builtin_object_size(arr, 3); + + // CHECK: store i32 88 + gi = __builtin_object_size(&arr->t[1], 0); + // CHECK: store i32 36 + gi = __builtin_object_size(&arr->t[1], 1); + // CHECK: store i32 88 + gi = __builtin_object_size(&arr->t[1], 2); + // CHECK: store i32 36 + gi = __builtin_object_size(&arr->t[1], 3); +} + +// CHECK-LABEL: @test6 +void test6() { + // Things that would normally trigger conservative estimates don't need to do + // so when we know the source of the allocation. + struct Data *const data = my_malloc(sizeof(*data) + 10); + // CHECK: store i32 11 + gi = __builtin_object_size(data->end, 0); + // CHECK: store i32 11 + gi = __builtin_object_size(data->end, 1); + // CHECK: store i32 11 + gi = __builtin_object_size(data->end, 2); + // CHECK: store i32 11 + gi = __builtin_object_size(data->end, 3); + + struct Data *const arr = my_calloc(sizeof(*arr) + 5, 3); + // AFAICT, GCC treats malloc and calloc identically. So, we should do the + // same. + // + // Additionally, GCC ignores the initial array index when determining whether + // we're writing off the end of an alloc_size base. e.g. + // arr[0].end + // arr[1].end + // arr[2].end + // ...Are all considered "writing off the end", because there's no way to tell + // with high accuracy if the user meant "allocate a single N-byte `Data`", + // or "allocate M smaller `Data`s with extra padding". + + // CHECK: store i32 112 + gi = __builtin_object_size(arr->end, 0); + // CHECK: store i32 112 + gi = __builtin_object_size(arr->end, 1); + // CHECK: store i32 112 + gi = __builtin_object_size(arr->end, 2); + // CHECK: store i32 112 + gi = __builtin_object_size(arr->end, 3); + + // CHECK: store i32 112 + gi = __builtin_object_size(arr[0].end, 0); + // CHECK: store i32 112 + gi = __builtin_object_size(arr[0].end, 1); + // CHECK: store i32 112 + gi = __builtin_object_size(arr[0].end, 2); + // CHECK: store i32 112 + gi = __builtin_object_size(arr[0].end, 3); + + // CHECK: store i32 64 + gi = __builtin_object_size(arr[1].end, 0); + // CHECK: store i32 64 + gi = __builtin_object_size(arr[1].end, 1); + // CHECK: store i32 64 + gi = __builtin_object_size(arr[1].end, 2); + // CHECK: store i32 64 + gi = __builtin_object_size(arr[1].end, 3); + + // CHECK: store i32 16 + gi = __builtin_object_size(arr[2].end, 0); + // CHECK: store i32 16 + gi = __builtin_object_size(arr[2].end, 1); + // CHECK: store i32 16 + gi = __builtin_object_size(arr[2].end, 2); + // CHECK: store i32 16 + gi = __builtin_object_size(arr[2].end, 3); +} + +// CHECK-LABEL: @test7 +void test7() { + struct Data *const data = my_malloc(sizeof(*data) + 5); + // CHECK: store i32 9 + gi = __builtin_object_size(data->pad, 0); + // CHECK: store i32 3 + gi = __builtin_object_size(data->pad, 1); + // CHECK: store i32 9 + gi = __builtin_object_size(data->pad, 2); + // CHECK: store i32 3 + gi = __builtin_object_size(data->pad, 3); +} + +// CHECK-LABEL: @test8 +void test8() { + // Non-const pointers aren't currently supported. 
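+  // Illustrative contrast with the const cases above (a sketch mirroring
+  // test3; `cbuf` is not part of this test): a *const* pointer pins the
+  // allocation site, so the builtin folds to a constant, whereas the
+  // mutable `buf` below may be reassigned before the query and so falls
+  // back to @llvm.objectsize:
+  //
+  //   char *const cbuf = (char *)my_calloc(100, 5);
+  //   gi = __builtin_object_size(cbuf, 0);  // folds to `store i32 500`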
+ void *buf = my_calloc(100, 5); + // CHECK: @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) + gi = __builtin_object_size(buf, 0); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(buf, 1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(buf, 2); + // CHECK: store i32 0 + gi = __builtin_object_size(buf, 3); +} + +// CHECK-LABEL: @test9 +void test9() { + // Check to be sure that we unwrap things correctly. + short *const buf0 = (my_malloc(100)); + short *const buf1 = (short*)(my_malloc(100)); + short *const buf2 = ((short*)(my_malloc(100))); + + // CHECK: store i32 100 + gi = __builtin_object_size(buf0, 0); + // CHECK: store i32 100 + gi = __builtin_object_size(buf1, 0); + // CHECK: store i32 100 + gi = __builtin_object_size(buf2, 0); +} + +// CHECK-LABEL: @test10 +void test10() { + // Yay overflow + short *const arr = my_calloc((size_t)-1 / 2 + 1, 2); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr, 0); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr, 1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr, 2); + // CHECK: store i32 0 + gi = __builtin_object_size(arr, 3); + + // As an implementation detail, CharUnits can't handle numbers greater than or + // equal to 2**63. Realistically, this shouldn't be a problem, but we should + // be sure we don't emit crazy results for this case. + short *const buf = my_malloc((size_t)-1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(buf, 0); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(buf, 1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(buf, 2); + // CHECK: store i32 0 + gi = __builtin_object_size(buf, 3); + + short *const arr_big = my_calloc((size_t)-1 / 2 - 1, 2); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr_big, 0); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr_big, 1); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr_big, 2); + // CHECK: store i32 0 + gi = __builtin_object_size(arr_big, 3); +} + +void *my_tiny_malloc(char) __attribute__((alloc_size(1))); +void *my_tiny_calloc(char, char) __attribute__((alloc_size(1, 2))); + +// CHECK-LABEL: @test11 +void test11() { + void *const vp = my_tiny_malloc(100); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 0); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 1); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 2); + // CHECK: store i32 100 + gi = __builtin_object_size(vp, 3); + + // N.B. This causes char overflow, but not size_t overflow, so it should be + // supported. + void *const arr = my_tiny_calloc(100, 5); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 0); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 1); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 2); + // CHECK: store i32 500 + gi = __builtin_object_size(arr, 3); +} + +void *my_signed_malloc(long) __attribute__((alloc_size(1))); +void *my_signed_calloc(long, long) __attribute__((alloc_size(1, 2))); + +// CHECK-LABEL: @test12 +void test12() { + // CHECK: store i32 100 + gi = __builtin_object_size(my_signed_malloc(100), 0); + // CHECK: store i32 500 + gi = __builtin_object_size(my_signed_calloc(100, 5), 0); + + void *const vp = my_signed_malloc(-2); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(vp, 0); + // N.B. These get lowered to -1 because the function calls may have + // side-effects, and we can't determine the objectsize. 
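+  // (Sketch of the documented fallback values: when the size cannot be
+  // determined, __builtin_object_size yields (size_t)-1 for types 0 and 1
+  // and 0 for types 2 and 3 -- hence the -1 stores checked below.)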
+ // CHECK: store i32 -1 + gi = __builtin_object_size(my_signed_malloc(-2), 0); + + void *const arr1 = my_signed_calloc(-2, 1); + void *const arr2 = my_signed_calloc(1, -2); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr1, 0); + // CHECK: @llvm.objectsize + gi = __builtin_object_size(arr2, 0); + // CHECK: store i32 -1 + gi = __builtin_object_size(my_signed_calloc(1, -2), 0); + // CHECK: store i32 -1 + gi = __builtin_object_size(my_signed_calloc(-2, 1), 0); +} diff --git a/test/CodeGen/always_inline.c b/test/CodeGen/always_inline.c index c91fd43f2761..19d93d9db066 100644 --- a/test/CodeGen/always_inline.c +++ b/test/CodeGen/always_inline.c @@ -1,7 +1,7 @@ // RUN: %clang -emit-llvm -S -o %t %s // RUN: not grep '@f0' %t // RUN: not grep 'call ' %t -// RUN: %clang -mllvm -disable-llvm-optzns -emit-llvm -S -o %t %s +// RUN: %clang -mllvm -disable-llvm-passes -emit-llvm -S -o %t %s // RUN: grep '@f0' %t | count 2 //static int f0() { diff --git a/test/CodeGen/arm-neon-directed-rounding.c b/test/CodeGen/arm-neon-directed-rounding.c index 3625e63b17a0..7471b1c230c9 100644 --- a/test/CodeGen/arm-neon-directed-rounding.c +++ b/test/CodeGen/arm-neon-directed-rounding.c @@ -3,133 +3,85 @@ #include <arm_neon.h> // CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) #2 -// CHECK: [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRNDA_V1_I]] float32x2_t test_vrnda_f32(float32x2_t a) { return vrnda_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) #2 -// CHECK: [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDAQ_V1_I]] float32x4_t test_vrndaq_f32(float32x4_t a) { return vrndaq_f32(a); } // CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) #2 -// CHECK: [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRNDM_V1_I]] float32x2_t test_vrndm_f32(float32x2_t a) { return vrndm_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <4 x float> -// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) #2 -// CHECK: [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDMQ_V1_I]] float32x4_t test_vrndmq_f32(float32x4_t a) { return vrndmq_f32(a); } // CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) #2 -// CHECK: [[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRNDN_V1_I]] float32x2_t test_vrndn_f32(float32x2_t a) { return vrndn_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) #2 -// CHECK: [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDNQ_V1_I]] float32x4_t test_vrndnq_f32(float32x4_t a) { return vrndnq_f32(a); } // CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) #2 -// CHECK: [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRNDP_V1_I]] float32x2_t test_vrndp_f32(float32x2_t a) { return vrndp_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) #2 -// CHECK: [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDPQ_V1_I]] float32x4_t test_vrndpq_f32(float32x4_t a) { return vrndpq_f32(a); } // CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDX_V_I:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) #2 -// CHECK: [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRNDX_V1_I]] float32x2_t test_vrndx_f32(float32x2_t a) { return vrndx_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) #2 -// CHECK: [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDXQ_V1_I]] float32x4_t test_vrndxq_f32(float32x4_t a) { return vrndxq_f32(a); } // CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) #2 -// CHECK: [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP1]] +// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) #2 +// CHECK: ret <2 x float> [[VRND_V1_I]] float32x2_t test_vrnd_f32(float32x2_t a) { return vrnd_f32(a); } // CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) #2 -// CHECK: [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP1]] +// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) #2 +// CHECK: ret <4 x float> [[VRNDQ_V1_I]] float32x4_t test_vrndq_f32(float32x4_t a) { return vrndq_f32(a); } diff --git a/test/CodeGen/arm-neon-fma.c b/test/CodeGen/arm-neon-fma.c index ff6acbcc2e91..9311f6be6834 100644 --- a/test/CodeGen/arm-neon-fma.c +++ b/test/CodeGen/arm-neon-fma.c @@ -8,26 +8,14 @@ #include <arm_neon.h> // CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %accum to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %lhs to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %rhs to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 +// CHECK: [[TMP6:%.*]] = call <2 x 
float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum) #2 // CHECK: ret <2 x float> [[TMP6]] float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) { return vfma_f32(accum, lhs, rhs); } // CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %accum to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %lhs to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %rhs to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum) #2 // CHECK: ret <4 x float> [[TMP6]] float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) { return vfmaq_f32(accum, lhs, rhs); diff --git a/test/CodeGen/arm-neon-numeric-maxmin.c b/test/CodeGen/arm-neon-numeric-maxmin.c index 6e385b9c49ef..38f020a756df 100644 --- a/test/CodeGen/arm-neon-numeric-maxmin.c +++ b/test/CodeGen/arm-neon-numeric-maxmin.c @@ -3,53 +3,29 @@ #include <arm_neon.h> // CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) #2 -// CHECK: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b) #2 +// CHECK: ret <2 x float> [[VMAXNM_V2_I]] float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { return vmaxnm_f32(a, b); } // CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) #2 -// CHECK: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b) #2 +// CHECK: ret <4 x float> [[VMAXNMQ_V2_I]] float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { return vmaxnmq_f32(a, b); } // CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x float> -// CHECK: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) #2 -// CHECK: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b) #2 +// CHECK: ret <2 x float> [[VMINNM_V2_I]] float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { return vminnm_f32(a, b); } // CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) #2 -// CHECK: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b) #2 +// CHECK: ret <4 x float> [[VMINNMQ_V2_I]] float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { return vminnmq_f32(a, b); } diff --git a/test/CodeGen/arm-neon-vcvtX.c b/test/CodeGen/arm-neon-vcvtX.c index 20cd97c858cb..4ea8fa874e48 100644 --- a/test/CodeGen/arm-neon-vcvtX.c +++ b/test/CodeGen/arm-neon-vcvtX.c @@ -3,144 +3,112 @@ #include <arm_neon.h> // CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) #2 +// CHECK: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTA_S32_V1_I]] int32x2_t test_vcvta_s32_f32(float32x2_t a) { return vcvta_s32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) #2 +// CHECK: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTA_U32_V1_I]] uint32x2_t test_vcvta_u32_f32(float32x2_t a) { return vcvta_u32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) #2 +// CHECK: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTAQ_S32_V1_I]] int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { return vcvtaq_s32_f32(a); } // CHECK-LABEL: define <4 x i32> 
@test_vcvtaq_u32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) #2 +// CHECK: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTAQ_U32_V1_I]] uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { return vcvtaq_u32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) #2 +// CHECK: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTN_S32_V1_I]] int32x2_t test_vcvtn_s32_f32(float32x2_t a) { return vcvtn_s32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_U32_V_I]]) #2 +// CHECK: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTN_U32_V1_I]] uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { return vcvtn_u32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) #2 +// CHECK: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTNQ_S32_V1_I]] int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { return vcvtnq_s32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) #2 +// CHECK: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTNQ_U32_V1_I]] uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { return vcvtnq_u32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) #2 +// CHECK: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTP_S32_V1_I]] int32x2_t test_vcvtp_s32_f32(float32x2_t a) { return vcvtp_s32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTP_U32_V1_I:%.*]] = call <2 
x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) #2 +// CHECK: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTP_U32_V1_I]] uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { return vcvtp_u32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) #2 +// CHECK: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTPQ_S32_V1_I]] int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { return vcvtpq_s32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) #2 +// CHECK: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTPQ_U32_V1_I]] uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { return vcvtpq_u32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) #2 +// CHECK: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTM_S32_V1_I]] int32x2_t test_vcvtm_s32_f32(float32x2_t a) { return vcvtm_s32_f32(a); } // CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) #2 +// CHECK: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) #2 // CHECK: ret <2 x i32> [[VCVTM_U32_V1_I]] uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { return vcvtm_u32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) #2 +// CHECK: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> [[VCVTMQ_S32_V1_I]] int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { return vcvtmq_s32_f32(a); } // CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) #2 +// CHECK: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) #2 // CHECK: ret <4 x i32> 
[[VCVTMQ_U32_V1_I]] uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { return vcvtmq_u32_f32(a); diff --git a/test/CodeGen/arm-swiftcall.c b/test/CodeGen/arm-swiftcall.c index d54a31337085..5a7e17068b7c 100644 --- a/test/CodeGen/arm-swiftcall.c +++ b/test/CodeGen/arm-swiftcall.c @@ -1,7 +1,6 @@ // RUN: %clang_cc1 -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s - -// This isn't really testing anything ARM-specific; it's just a convenient -// 32-bit platform. +// RUN: %clang_cc1 -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s #define SWIFTCALL __attribute__((swiftcall)) #define OUT __attribute__((swift_indirect_result)) @@ -66,6 +65,9 @@ typedef int int3 __attribute__((ext_vector_type(3))); typedef int int4 __attribute__((ext_vector_type(4))); typedef int int5 __attribute__((ext_vector_type(5))); typedef int int8 __attribute__((ext_vector_type(8))); +typedef char char16 __attribute__((ext_vector_type(16))); +typedef short short8 __attribute__((ext_vector_type(8))); +typedef long long long2 __attribute__((ext_vector_type(2))); #define TEST(TYPE) \ SWIFTCALL TYPE return_##TYPE(void) { \ @@ -278,41 +280,41 @@ typedef union { } union_het_fp; TEST(union_het_fp) // CHECK-LABEL: define {{.*}} @return_union_het_fp() -// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align 4 -// CHECK: [[VAR:%.*]] = alloca [[REC]], align 4 +// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align {{(4|8)}} +// CHECK: [[VAR:%.*]] = alloca [[REC]], align {{(4|8)}} // CHECK: @llvm.memcpy // CHECK: @llvm.memcpy // CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32 }]]* // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 -// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align 4 +// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align {{(4|8)}} // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 -// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align 4 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align {{(4|8)}} // CHECK: [[T0:%.*]] = insertvalue [[UAGG:{ i32, i32 }]] undef, i32 [[FIRST]], 0 // CHECK: [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1 // CHECK: ret [[UAGG]] [[T1]] // CHECK-LABEL: define {{.*}} @take_union_het_fp(i32, i32) -// CHECK: [[V:%.*]] = alloca [[REC]], align 4 +// CHECK: [[V:%.*]] = alloca [[REC]], align {{(4|8)}} // CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]* // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 -// CHECK: store i32 %0, i32* [[T0]], align 4 +// CHECK: store i32 %0, i32* [[T0]], align {{(4|8)}} // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 -// CHECK: store i32 %1, i32* [[T0]], align 4 +// CHECK: store i32 %1, i32* [[T0]], align {{(4|8)}} // CHECK: ret void // CHECK-LABEL: define void @test_union_het_fp() -// CHECK: [[TMP:%.*]] = alloca [[REC]], align 4 +// CHECK: [[TMP:%.*]] = alloca [[REC]], align {{(4|8)}} // CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_union_het_fp() // CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]* // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 // CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0 -// CHECK: store i32 [[T1]], i32* [[T0]], align 4 +// CHECK: store i32 [[T1]], i32* [[T0]], align {{(4|8)}} // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 
// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1 -// CHECK: store i32 [[T1]], i32* [[T0]], align 4 +// CHECK: store i32 [[T1]], i32* [[T0]], align {{(4|8)}} // CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]* // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 -// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align 4 +// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align {{(4|8)}} // CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 -// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align 4 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align {{(4|8)}} // CHECK: call [[SWIFTCC]] void @take_union_het_fp(i32 [[FIRST]], i32 [[SECOND]]) // CHECK: ret void @@ -502,3 +504,512 @@ typedef struct { } misaligned_int3; TEST(misaligned_int3) // CHECK-LABEL: define {{.*}} @take_misaligned_int3(i32, i32, i32, i32) + +typedef struct { + float f0; +} struct_f1; +TEST(struct_f1) +// CHECK-LABEL: define swiftcc float @return_struct_f1() +// CHECK-LABEL: define swiftcc void @take_struct_f1(float) + +typedef struct { + float f0; + float f1; +} struct_f2; +TEST(struct_f2) +// CHECK-LABEL: define swiftcc { float, float } @return_struct_f2() +// CHECK-LABEL: define swiftcc void @take_struct_f2(float, float) + +typedef struct { + float f0; + float f1; + float f2; +} struct_f3; +TEST(struct_f3) +// CHECK-LABEL: define swiftcc { float, float, float } @return_struct_f3() +// CHECK-LABEL: define swiftcc void @take_struct_f3(float, float, float) + +typedef struct { + float f0; + float f1; + float f2; + float f3; +} struct_f4; +TEST(struct_f4) +// CHECK-LABEL: define swiftcc { float, float, float, float } @return_struct_f4() +// CHECK-LABEL: define swiftcc void @take_struct_f4(float, float, float, float) + + +typedef struct { + double d0; +} struct_d1; +TEST(struct_d1) +// CHECK-LABEL: define swiftcc double @return_struct_d1() +// CHECK-LABEL: define swiftcc void @take_struct_d1(double) + +typedef struct { + double d0; + double d1; +} struct_d2; +TEST(struct_d2) +// CHECK-LABEL: define swiftcc { double, double } @return_struct_d2() +// CHECK-LABEL: define swiftcc void @take_struct_d2(double, double) + +typedef struct { + double d0; + double d1; + double d2; +} struct_d3; +TEST(struct_d3) +// CHECK-LABEL: define swiftcc { double, double, double } @return_struct_d3() +// CHECK-LABEL: define swiftcc void @take_struct_d3(double, double, double) + +typedef struct { + double d0; + double d1; + double d2; + double d3; +} struct_d4; +TEST(struct_d4) +// CHECK-LABEL: define swiftcc { double, double, double, double } @return_struct_d4() +// CHECK-LABEL: define swiftcc void @take_struct_d4(double, double, double, double) + +typedef struct { + double d0; + double d1; + double d2; + double d3; + double d4; +} struct_d5; +TEST(struct_d5) +// CHECK: define swiftcc void @return_struct_d5([[STRUCT5:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_d5([[STRUCT5]] + +typedef struct { + char c0; +} struct_c1; +TEST(struct_c1) +// CHECK-LABEL: define swiftcc i8 @return_struct_c1() +// CHECK-LABEL: define swiftcc void @take_struct_c1(i8) + +typedef struct { + char c0; + char c1; +} struct_c2; +TEST(struct_c2) +// CHECK-LABEL: define swiftcc i16 @return_struct_c2() +// CHECK-LABEL: define swiftcc void @take_struct_c2(i16) +// + +typedef struct { + char c0; + char c1; + char c2; +} struct_c3; +TEST(struct_c3) +// CHECK-LABEL: define swiftcc i32 @return_struct_c3() +// CHECK-LABEL: define swiftcc void @take_struct_c3(i32) + 
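+// The char-struct cases here all follow one repacking rule on these 32-bit
+// targets: an N-char struct is passed and returned as integer chunks of at
+// most word size, so one char lowers to i8, two to i16, three or four to
+// i32, and five to { i32, i8 }. As a rough sketch (assuming the TEST macro
+// body matches its first line quoted earlier in this file), TEST(struct_c4)
+// expands to something like:
+//
+//   SWIFTCALL struct_c4 return_struct_c4(void) { struct_c4 r = {0}; return r; }
+//   SWIFTCALL void take_struct_c4(struct_c4 v) {}
+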
+typedef struct { + char c0; + char c1; + char c2; + char c3; +} struct_c4; +TEST(struct_c4) +// CHECK-LABEL: define swiftcc i32 @return_struct_c4() +// CHECK-LABEL: define swiftcc void @take_struct_c4(i32) + +typedef struct { + char c0; + char c1; + char c2; + char c3; + char c4; +} struct_c5; +TEST(struct_c5) +// CHECK-LABEL: define swiftcc { i32, i8 } @return_struct_c5() +// CHECK-LABEL: define swiftcc void @take_struct_c5(i32, i8) + +typedef struct { + short s0; +} struct_s1; +TEST(struct_s1) +// CHECK-LABEL: define swiftcc i16 @return_struct_s1() +// CHECK-LABEL: define swiftcc void @take_struct_s1(i16) + +typedef struct { + short s0; + short s1; +} struct_s2; +TEST(struct_s2) +// CHECK-LABEL: define swiftcc i32 @return_struct_s2() +// CHECK-LABEL: define swiftcc void @take_struct_s2(i32) + +typedef struct { + short s0; + short s1; + short s2; +} struct_s3; +TEST(struct_s3) +// CHECK-LABEL: define swiftcc { i32, i16 } @return_struct_s3() +// CHECK-LABEL: define swiftcc void @take_struct_s3(i32, i16) + +typedef struct { + short s0; + short s1; + short s2; + short s3; +} struct_s4; +TEST(struct_s4) +// CHECK-LABEL: define swiftcc { i32, i32 } @return_struct_s4() +// CHECK-LABEL: define swiftcc void @take_struct_s4(i32, i32) + +typedef struct { + short s0; + short s1; + short s2; + short s3; + short s4; +} struct_s5; +TEST(struct_s5) +// CHECK-LABEL: define swiftcc { i32, i32, i16 } @return_struct_s5() +// CHECK-LABEL: define swiftcc void @take_struct_s5(i32, i32, i16) + + +typedef struct { + int i0; +} struct_i1; +TEST(struct_i1) +// CHECK-LABEL: define swiftcc i32 @return_struct_i1() +// CHECK-LABEL: define swiftcc void @take_struct_i1(i32) + +typedef struct { + int i0; + int i1; +} struct_i2; +TEST(struct_i2) +// CHECK-LABEL: define swiftcc { i32, i32 } @return_struct_i2() +// CHECK-LABEL: define swiftcc void @take_struct_i2(i32, i32) + +typedef struct { + int i0; + int i1; + int i2; +} struct_i3; +TEST(struct_i3) +// CHECK-LABEL: define swiftcc { i32, i32, i32 } @return_struct_i3() +// CHECK-LABEL: define swiftcc void @take_struct_i3(i32, i32, i32) + +typedef struct { + int i0; + int i1; + int i2; + int i3; +} struct_i4; +TEST(struct_i4) +// CHECK-LABEL: define swiftcc { i32, i32, i32, i32 } @return_struct_i4() +// CHECK-LABEL: define swiftcc void @take_struct_i4(i32, i32, i32, i32) + +typedef struct { + long long l0; +} struct_l1; +TEST(struct_l1) +// CHECK-LABEL: define swiftcc i64 @return_struct_l1() +// CHECK-LABEL: define swiftcc void @take_struct_l1(i64) + +typedef struct { + long long l0; + long long l1; +} struct_l2; +TEST(struct_l2) +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_l2() +// CHECK-LABEL: define swiftcc void @take_struct_l2(i64, i64) + +typedef struct { + long long l0; + long long l1; + long long l2; +} struct_l3; +TEST(struct_l3) +// CHECK: define swiftcc void @return_struct_l3([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_l3([[STRUCT]] + +typedef struct { + long long l0; + long long l1; + long long l2; + long long l3; +} struct_l4; +TEST(struct_l4) +// CHECK: define swiftcc void @return_struct_l4([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_l4([[STRUCT]] + +typedef struct { + long long l0; + long long l1; + long long l2; + long long l3; + long long l4; +} struct_l5; +TEST(struct_l5) +// CHECK: define swiftcc void @return_struct_l5([[STRUCT5:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_l5([[STRUCT5]]* + +typedef struct { + char16 c0; +} struct_vc1; +TEST(struct_vc1) +// 
CHECK-LABEL: define swiftcc <16 x i8> @return_struct_vc1() +// CHECK-LABEL: define swiftcc void @take_struct_vc1(<16 x i8>) + +typedef struct { + char16 c0; + char16 c1; +} struct_vc2; +TEST(struct_vc2) +// CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8> } @return_struct_vc2() +// CHECK-LABEL: define swiftcc void @take_struct_vc2(<16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; +} struct_vc3; +TEST(struct_vc3) +// CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8>, <16 x i8> } @return_struct_vc3() +// CHECK-LABEL: define swiftcc void @take_struct_vc3(<16 x i8>, <16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; + char16 c3; +} struct_vc4; +TEST(struct_vc4) +// CHECK-LABEL: define swiftcc { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @return_struct_vc4() +// CHECK-LABEL: define swiftcc void @take_struct_vc4(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +typedef struct { + char16 c0; + char16 c1; + char16 c2; + char16 c3; + char16 c4; +} struct_vc5; +TEST(struct_vc5) +// CHECK: define swiftcc void @return_struct_vc5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vc5([[STRUCT]] + +typedef struct { + short8 c0; +} struct_vs1; +TEST(struct_vs1) +// CHECK-LABEL: define swiftcc <8 x i16> @return_struct_vs1() +// CHECK-LABEL: define swiftcc void @take_struct_vs1(<8 x i16>) + +typedef struct { + short8 c0; + short8 c1; +} struct_vs2; +TEST(struct_vs2) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16> } @return_struct_vs2() +// CHECK-LABEL: define swiftcc void @take_struct_vs2(<8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; +} struct_vs3; +TEST(struct_vs3) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16>, <8 x i16> } @return_struct_vs3() +// CHECK-LABEL: define swiftcc void @take_struct_vs3(<8 x i16>, <8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; + short8 c3; +} struct_vs4; +TEST(struct_vs4) +// CHECK-LABEL: define swiftcc { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @return_struct_vs4() +// CHECK-LABEL: define swiftcc void @take_struct_vs4(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>) + +typedef struct { + short8 c0; + short8 c1; + short8 c2; + short8 c3; + short8 c4; +} struct_vs5; +TEST(struct_vs5) +// CHECK: define swiftcc void @return_struct_vs5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vs5([[STRUCT]] + +typedef struct { + int4 c0; +} struct_vi1; +TEST(struct_vi1) +// CHECK-LABEL: define swiftcc <4 x i32> @return_struct_vi1() +// CHECK-LABEL: define swiftcc void @take_struct_vi1(<4 x i32>) + +typedef struct { + int4 c0; + int4 c1; +} struct_vi2; +TEST(struct_vi2) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32> } @return_struct_vi2() +// CHECK-LABEL: define swiftcc void @take_struct_vi2(<4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; +} struct_vi3; +TEST(struct_vi3) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32>, <4 x i32> } @return_struct_vi3() +// CHECK-LABEL: define swiftcc void @take_struct_vi3(<4 x i32>, <4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; + int4 c3; +} struct_vi4; +TEST(struct_vi4) +// CHECK-LABEL: define swiftcc { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @return_struct_vi4() +// CHECK-LABEL: define swiftcc void @take_struct_vi4(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) + +typedef struct { + int4 c0; + int4 c1; + int4 c2; + int4 c3; + int4 c4; +} struct_vi5; +TEST(struct_vi5) +// CHECK: define swiftcc void 
@return_struct_vi5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vi5([[STRUCT]] + +typedef struct { + long2 c0; +} struct_vl1; +TEST(struct_vl1) +// CHECK-LABEL: define swiftcc <2 x i64> @return_struct_vl1() +// CHECK-LABEL: define swiftcc void @take_struct_vl1(<2 x i64>) + +typedef struct { + long2 c0; + long2 c1; + long2 c2; + long2 c3; +} struct_vl4; +TEST(struct_vl4) +// CHECK-LABEL: define swiftcc { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @return_struct_vl4() +// CHECK-LABEL: define swiftcc void @take_struct_vl4(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>) + +typedef struct { + long2 c0; + long2 c1; + long2 c2; + long2 c3; + long2 c4; +} struct_vl5; +TEST(struct_vl5) +// CHECK: define swiftcc void @return_struct_vl5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vl5([[STRUCT]] + +typedef struct { + double2 c0; +} struct_vd1; +TEST(struct_vd1) +// CHECK-LABEL: define swiftcc <2 x double> @return_struct_vd1() +// CHECK-LABEL: define swiftcc void @take_struct_vd1(<2 x double>) + +typedef struct { + double2 c0; + double2 c1; + double2 c2; + double2 c3; +} struct_vd4; +TEST(struct_vd4) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @return_struct_vd4() +// CHECK-LABEL: define swiftcc void @take_struct_vd4(<2 x double>, <2 x double>, <2 x double>, <2 x double>) + +typedef struct { + double2 c0; + double2 c1; + double2 c2; + double2 c3; + double2 c4; +} struct_vd5; +TEST(struct_vd5) +// CHECK: define swiftcc void @return_struct_vd5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vd5([[STRUCT]] + +typedef struct { + double4 c0; +} struct_vd41; +TEST(struct_vd41) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double> } @return_struct_vd41() +// CHECK-LABEL: define swiftcc void @take_struct_vd41(<2 x double>, <2 x double>) + +typedef struct { + double4 c0; + double4 c1; +} struct_vd42; +TEST(struct_vd42) +// CHECK-LABEL: define swiftcc { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @return_struct_vd42() +// CHECK-LABEL: define swiftcc void @take_struct_vd42(<2 x double>, <2 x double>, <2 x double>, <2 x double>) + +typedef struct { + double4 c0; + double4 c1; + double4 c2; +} struct_vd43; +TEST(struct_vd43) +// CHECK: define swiftcc void @return_struct_vd43([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vd43([[STRUCT]] + +typedef struct { + float4 c0; +} struct_vf1; +TEST(struct_vf1) +// CHECK-LABEL: define swiftcc <4 x float> @return_struct_vf1() +// CHECK-LABEL: define swiftcc void @take_struct_vf1(<4 x float>) + +typedef struct { + float4 c0; + float4 c1; +} struct_vf2; +TEST(struct_vf2) +// CHECK-LABEL: define swiftcc { <4 x float>, <4 x float> } @return_struct_vf2() +// CHECK-LABEL: define swiftcc void @take_struct_vf2(<4 x float>, <4 x float>) + +typedef struct { + float4 c0; + float4 c1; + float4 c2; + float4 c3; +} struct_vf4; +TEST(struct_vf4) +// CHECK-LABEL: define swiftcc { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @return_struct_vf4() +// CHECK-LABEL: define swiftcc void @take_struct_vf4(<4 x float>, <4 x float>, <4 x float>, <4 x float>) + +typedef struct { + float4 c0; + float4 c1; + float4 c2; + float4 c3; + float4 c4; +} struct_vf5; +TEST(struct_vf5) +// CHECK: define swiftcc void @return_struct_vf5([[STRUCT:%.*]]* noalias sret +// CHECK: define swiftcc void @take_struct_vf5([[STRUCT]] + +typedef struct { + float8 c0; +} struct_vf81; +TEST(struct_vf81) +// CHECK-LABEL: define swiftcc { <4 x float>, 
<4 x float> } @return_struct_vf81() +// CHECK-LABEL: define swiftcc void @take_struct_vf81(<4 x float>, <4 x float>) diff --git a/test/CodeGen/arm-target-features.c b/test/CodeGen/arm-target-features.c index 189c6f701059..7437fb0e4ed2 100644 --- a/test/CodeGen/arm-target-features.c +++ b/test/CodeGen/arm-target-features.c @@ -29,6 +29,8 @@ // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a72 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a73 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // CHECK-BASIC-V8: "target-features"="+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon" diff --git a/test/CodeGen/arm64_crypto.c b/test/CodeGen/arm64_crypto.c index e63009a3109c..c8bd4f978eb9 100644 --- a/test/CodeGen/arm64_crypto.c +++ b/test/CodeGen/arm64_crypto.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -target-feature +crypto -ffreestanding -Os -S -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -target-feature +crypto -ffreestanding -fexperimental-new-pass-manager -Os -S -o - %s | FileCheck %s // REQUIRES: aarch64-registered-target #include <arm_neon.h> diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c index 4cc7eedffd5b..ad8587b0db2a 100644 --- a/test/CodeGen/arm_neon_intrinsics.c +++ b/test/CodeGen/arm_neon_intrinsics.c @@ -6,7 +6,7 @@ #include <arm_neon.h> -// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vaba_s8( // CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -14,35 +14,29 @@ int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vaba_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] // CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vaba_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] // CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); } -// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vaba_u8( // CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -50,35 +44,29 @@ uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vaba_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] // CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vaba_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] // CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vabaq_s8( // CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] 
= add <16 x i8> %a, [[VABDQ_V_I_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -86,35 +74,29 @@ int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vabaq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4 +// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c) #4 // CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vabaq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4 +// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c) #4 // CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vabaq_u8( // CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -122,36 +104,29 @@ uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vabaq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4 +// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c) #4 // CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]] +// 
CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vabaq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4 +// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c) #4 // CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); } - -// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vabal_s8( // CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] @@ -160,41 +135,33 @@ int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vabal_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4 +// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vabal_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x 
i32> [[VABD_V1_I_I_I]]) #4 +// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vabal_u8( // CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] @@ -203,201 +170,161 @@ uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vabal_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4 +// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vabal_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4 +// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 
x i8> [[TMP3]] to <2 x i32> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); } - -// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vabd_s8( // CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VABD_V_I]] int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vabd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4 +// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VABD_V2_I]] int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vabd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4 +// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VABD_V2_I]] int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vabd_u8( // CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VABD_V_I]] uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vabd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4 +// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: 
[[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VABD_V2_I]] uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vabd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4 +// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VABD_V2_I]] uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vabd_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) #4 +// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VABD_V2_I]] float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vabdq_s8( // CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VABDQ_V_I]] int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vabdq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4 +// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VABDQ_V2_I]] int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vabdq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VABDQ_V_I:%.*]] = 
bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4 +// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VABDQ_V2_I]] int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vabdq_u8( // CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VABDQ_V_I]] uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vabdq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4 +// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VABDQ_V2_I]] uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vabdq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4 +// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VABDQ_V2_I]] uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vabdq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) #4 +// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] 
= bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VABDQ_V2_I]] float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) { return vabdq_f32(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vabdl_s8( // CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I_I]] @@ -405,39 +332,31 @@ int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vabdl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I_I]] int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vabdl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I_I]] int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vabdl_u8( // CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I_I]] @@ -445,241 +364,222 @@ uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); } -// CHECK-LABEL: define <4 x i32> 
@test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vabdl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I_I]] uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vabdl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4 +// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I_I]] uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vabs_s8( // CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4 // CHECK: ret <8 x i8> [[VABS_I]] int8x8_t test_vabs_s8(int8x8_t a) { return vabs_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vabs_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4 // CHECK: ret <4 x i16> [[VABS1_I]] int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vabs_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call 
<2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4 // CHECK: ret <2 x i32> [[VABS1_I]] int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); } -// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vabs_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4 // CHECK: ret <2 x float> [[VABS1_I]] float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vabsq_s8( // CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4 // CHECK: ret <16 x i8> [[VABS_I]] int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vabsq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4 // CHECK: ret <8 x i16> [[VABS1_I]] int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vabsq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4 // CHECK: ret <4 x i32> [[VABS1_I]] int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); } -// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vabsq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4 +// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4 // CHECK: ret <4 x float> [[VABS1_I]] float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); } - -// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vadd_s8( // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b // CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vadd_s16( // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b // CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) { return vadd_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vadd_s32( // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b // CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) { return vadd_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vadd_s64( // CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b // CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) { return vadd_s64(a, 
b); } -// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vadd_f32( // CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b // CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) { return vadd_f32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vadd_u8( // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b // CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) { return vadd_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vadd_u16( // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b // CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) { return vadd_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vadd_u32( // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b // CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) { return vadd_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vadd_u64( // CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b // CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) { return vadd_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddq_s8( // CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b // CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) { return vaddq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddq_s16( // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b // CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) { return vaddq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddq_s32( // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddq_s64( // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); } -// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vaddq_f32( // CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b // CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddq_u8( // CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b // CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) { return vaddq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddq_u16( // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b // CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) { return vaddq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddq_u32( // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t 
test_vaddq_u32(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddq_u64( // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VADDHN2_I]] @@ -687,12 +587,10 @@ int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VADDHN2_I]] @@ -700,12 +598,10 @@ int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VADDHN2_I]] @@ -713,12 +609,10 @@ int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VADDHN2_I]] @@ -726,12 +620,10 @@ uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); } -// CHECK-LABEL: 
define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VADDHN2_I]] @@ -739,12 +631,10 @@ uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vaddhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b // CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32> // CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VADDHN2_I]] @@ -752,8 +642,7 @@ uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -762,33 +651,29 @@ int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> 
[[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddl_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -797,34 +682,29 @@ uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -832,27 +712,25 @@ int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b 
to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vaddw_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -860,141 +738,137 @@ uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vaddw_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vaddw_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vand_s8( // CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b // CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { return vand_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vand_s16( // CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b // CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { return vand_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vand_s32( // CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b // CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { return vand_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vand_s64( // CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b // CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { return vand_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vand_u8( // CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b // CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { return vand_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vand_u16( // CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b // CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { return vand_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vand_u32( // CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b // CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { return vand_u32(a, b); } -// 
CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vand_u64( // CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b // CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { return vand_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vandq_s8( // CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b // CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { return vandq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vandq_s16( // CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b // CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { return vandq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vandq_s32( // CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b // CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { return vandq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vandq_s64( // CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b // CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { return vandq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vandq_u8( // CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b // CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { return vandq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vandq_u16( // CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b // CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { return vandq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vandq_u32( // CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b // CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { return vandq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vandq_u64( // CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b // CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { return vandq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vbic_s8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] // CHECK: ret <8 x i8> [[AND_I]] @@ -1002,7 +876,7 @@ int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { return vbic_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vbic_s16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] // CHECK: ret <4 x i16> [[AND_I]] @@ -1010,7 +884,7 @@ int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { return vbic_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vbic_s32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> // CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] // CHECK: ret <2 x i32> [[AND_I]] @@ -1018,7 +892,7 @@ 
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { return vbic_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vbic_s64( // CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> // CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] // CHECK: ret <1 x i64> [[AND_I]] @@ -1026,7 +900,7 @@ int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { return vbic_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vbic_u8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] // CHECK: ret <8 x i8> [[AND_I]] @@ -1034,7 +908,7 @@ uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { return vbic_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vbic_u16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] // CHECK: ret <4 x i16> [[AND_I]] @@ -1042,7 +916,7 @@ uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { return vbic_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vbic_u32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> // CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] // CHECK: ret <2 x i32> [[AND_I]] @@ -1050,7 +924,7 @@ uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { return vbic_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vbic_u64( // CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> // CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] // CHECK: ret <1 x i64> [[AND_I]] @@ -1058,7 +932,7 @@ uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { return vbic_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vbicq_s8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] // CHECK: ret <16 x i8> [[AND_I]] @@ -1066,7 +940,7 @@ int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { return vbicq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vbicq_s16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] // CHECK: ret <8 x i16> [[AND_I]] @@ -1074,7 +948,7 @@ int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { return vbicq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vbicq_s32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] // CHECK: ret <4 x i32> [[AND_I]] @@ -1082,7 +956,7 @@ int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { return vbicq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vbicq_s64( // CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> // CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] // CHECK: ret <2 x i64> [[AND_I]] @@ -1090,7 +964,7 @@ int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { return vbicq_s64(a, b); } -// 
CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vbicq_u8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] // CHECK: ret <16 x i8> [[AND_I]] @@ -1098,7 +972,7 @@ uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { return vbicq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vbicq_u16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] // CHECK: ret <8 x i16> [[AND_I]] @@ -1106,7 +980,7 @@ uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { return vbicq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vbicq_u32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] // CHECK: ret <4 x i32> [[AND_I]] @@ -1114,7 +988,7 @@ uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { return vbicq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vbicq_u64( // CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> // CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] // CHECK: ret <2 x i64> [[AND_I]] @@ -1122,15 +996,14 @@ uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { return vbicq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vbsl_s8( // CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4 // CHECK: ret <8 x i8> [[VBSL_V_I]] int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) { return vbsl_s8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vbsl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> @@ -1141,7 +1014,7 @@ int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) { return vbsl_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vbsl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> @@ -1152,7 +1025,7 @@ int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) { return vbsl_s32(a, b, c); } -// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK-LABEL: @test_vbsl_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> @@ -1163,14 +1036,14 @@ int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) { return vbsl_s64(a, b, c); } -// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vbsl_u8( // CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4 // CHECK: ret <8 x i8> [[VBSL_V_I]] uint8x8_t test_vbsl_u8(uint8x8_t a, 
uint8x8_t b, uint8x8_t c) { return vbsl_u8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vbsl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> @@ -1181,7 +1054,7 @@ uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vbsl_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vbsl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> @@ -1192,7 +1065,7 @@ uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vbsl_u32(a, b, c); } -// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK-LABEL: @test_vbsl_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> @@ -1203,7 +1076,7 @@ uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) { return vbsl_u64(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vbsl_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> @@ -1214,14 +1087,14 @@ float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) { return vbsl_f32(a, b, c); } -// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vbsl_p8( // CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4 // CHECK: ret <8 x i8> [[VBSL_V_I]] poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) { return vbsl_p8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vbsl_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> @@ -1232,14 +1105,14 @@ poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) { return vbsl_p16(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vbslq_s8( // CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4 // CHECK: ret <16 x i8> [[VBSLQ_V_I]] int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vbslq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> @@ -1250,7 +1123,7 @@ int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vbslq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: 
[[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> @@ -1261,7 +1134,7 @@ int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK-LABEL: @test_vbslq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> @@ -1272,14 +1145,14 @@ int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return vbslq_s64(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vbslq_u8( // CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4 // CHECK: ret <16 x i8> [[VBSLQ_V_I]] uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vbslq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> @@ -1290,7 +1163,7 @@ uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vbslq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> @@ -1301,7 +1174,7 @@ uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK-LABEL: @test_vbslq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> @@ -1312,7 +1185,7 @@ uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return vbslq_u64(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK-LABEL: @test_vbslq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> @@ -1323,14 +1196,14 @@ float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vbslq_p8( // CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4 // CHECK: ret <16 x i8> [[VBSLQ_V_I]] poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) { return vbslq_p8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vbslq_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> @@ -1341,100 +1214,79 @@ poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) { return 
vbslq_p16(a, b, c); } - -// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vcage_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) { return vcage_f32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vcageq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) { return vcageq_f32(a, b); } - -// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vcagt_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) { return vcagt_f32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vcagtq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) { return vcagtq_f32(a, b); } - -// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vcale_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> 
@llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a) #4 // CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) { return vcale_f32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vcaleq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a) #4 // CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) { return vcaleq_f32(a, b); } - -// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vcalt_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a) #4 // CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) { return vcalt_f32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vcaltq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a) #4 // CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) { return vcaltq_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vceq_s8( // CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> // CHECK: ret <8 x i8> [[SEXT_I]] @@ -1442,7 +1294,7 @@ uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) { return vceq_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vceq_s16( // CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b // CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> // CHECK: ret <4 x i16> [[SEXT_I]] @@ -1450,7 +1302,7 @@ uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) { return vceq_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vceq_s32( // CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b // CHECK: 
-
-// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceq_s8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1442,7 +1294,7 @@ uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
  return vceq_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vceq_s16(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1450,7 +1302,7 @@ uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
  return vceq_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vceq_s32(
// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1458,7 +1310,7 @@ uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
  return vceq_s32(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vceq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1466,7 +1318,7 @@ uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
  return vceq_f32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceq_u8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1474,7 +1326,7 @@ uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
  return vceq_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vceq_u16(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1482,7 +1334,7 @@ uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
  return vceq_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vceq_u32(
// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1490,7 +1342,7 @@ uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
  return vceq_u32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceq_p8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1498,7 +1350,7 @@ uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
  return vceq_p8(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceqq_s8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1506,7 +1358,7 @@ uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
  return vceqq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vceqq_s16(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1514,7 +1366,7 @@ uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
  return vceqq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vceqq_s32(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1522,7 +1374,7 @@ uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
  return vceqq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vceqq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1530,7 +1382,7 @@ uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
  return vceqq_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceqq_u8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1538,7 +1390,7 @@ uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
  return vceqq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vceqq_u16(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1546,7 +1398,7 @@ uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
  return vceqq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vceqq_u32(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1554,7 +1406,7 @@ uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
  return vceqq_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vceqq_p8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1562,8 +1414,7 @@ uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
  return vceqq_p8(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcge_s8(
// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1571,7 +1422,7 @@ uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
  return vcge_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcge_s16(
// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1579,7 +1430,7 @@ uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
  return vcge_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcge_s32(
// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1587,7 +1438,7 @@ uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
  return vcge_s32(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vcge_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1595,7 +1446,7 @@ uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
  return vcge_f32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcge_u8(
// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1603,7 +1454,7 @@ uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
  return vcge_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcge_u16(
// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1611,7 +1462,7 @@ uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
  return vcge_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcge_u32(
// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1619,7 +1470,7 @@ uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
  return vcge_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1627,7 +1478,7 @@ uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
  return vcgeq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1635,7 +1486,7 @@ uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
  return vcgeq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1643,7 +1494,7 @@ uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
  return vcgeq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1651,7 +1502,7 @@ uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
  return vcgeq_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_u8(
// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1659,7 +1510,7 @@ uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgeq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_u16(
// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1667,7 +1518,7 @@ uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgeq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgeq_u32(
// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1675,8 +1526,7 @@ uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgeq_u32(a, b);
}
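[Editorial note: every comparison intrinsic in this family now checks as a plain IR compare followed by a sign extend, so a true lane becomes all-ones and a false lane becomes zero. A minimal scalar model of test_vcge_s8 above, assuming only <stdint.h>; model_vcge_s8 is a hypothetical helper, the C analogue of "icmp sge" + "sext i1":]

#include <stdint.h>

/* One lane at a time: signed >=, widened to a 0 / 0xFF mask per
 * lane, matching the sext'd <8 x i1> result the tests check. */
static void model_vcge_s8(const int8_t a[8], const int8_t b[8],
                          uint8_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = (a[i] >= b[i]) ? 0xFF : 0x00;
}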
-
-// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgt_s8(
// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1684,7 +1534,7 @@ uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
  return vcgt_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgt_s16(
// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1692,7 +1542,7 @@ uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
  return vcgt_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgt_s32(
// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1700,7 +1550,7 @@ uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
  return vcgt_s32(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vcgt_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1708,7 +1558,7 @@ uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
  return vcgt_f32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgt_u8(
// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1716,7 +1566,7 @@ uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
  return vcgt_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgt_u16(
// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1724,7 +1574,7 @@ uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
  return vcgt_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgt_u32(
// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1732,7 +1582,7 @@ uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
  return vcgt_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1740,7 +1590,7 @@ uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
  return vcgtq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1748,7 +1598,7 @@ uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
  return vcgtq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1756,7 +1606,7 @@ uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
  return vcgtq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1764,7 +1614,7 @@ uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
  return vcgtq_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1772,7 +1622,7 @@ uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgtq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1780,7 +1630,7 @@ uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgtq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcgtq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1788,8 +1638,7 @@ uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgtq_u32(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcle_s8(
// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1797,7 +1646,7 @@ uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
  return vcle_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcle_s16(
// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1805,7 +1654,7 @@ uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
  return vcle_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcle_s32(
// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1813,7 +1662,7 @@ uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
  return vcle_s32(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vcle_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1821,7 +1670,7 @@ uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
  return vcle_f32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcle_u8(
// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1829,7 +1678,7 @@ uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
  return vcle_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcle_u16(
// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1837,7 +1686,7 @@ uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
  return vcle_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcle_u32(
// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1845,7 +1694,7 @@ uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
  return vcle_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcleq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1853,7 +1702,7 @@ uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
  return vcleq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcleq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1861,7 +1710,7 @@ uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
  return vcleq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcleq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1869,7 +1718,7 @@ uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
  return vcleq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vcleq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1877,7 +1726,7 @@ uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
  return vcleq_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcleq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -1885,7 +1734,7 @@ uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
  return vcleq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcleq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -1893,7 +1742,7 @@ uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
  return vcleq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcleq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -1901,67 +1750,57 @@ uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
  return vcleq_u32(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcls_s8(
// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
// CHECK: ret <8 x i8> [[VCLS_V_I]]
int8x8_t test_vcls_s8(int8x8_t a) {
  return vcls_s8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vcls_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
+// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VCLS_V1_I]]
int16x4_t test_vcls_s16(int16x4_t a) {
  return vcls_s16(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcls_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
+// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VCLS_V1_I]]
int32x2_t test_vcls_s32(int32x2_t a) {
  return vcls_s32(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclsq_s8(
// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
int8x16_t test_vclsq_s8(int8x16_t a) {
  return vclsq_s8(a);
}

-// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclsq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
int16x8_t test_vclsq_s16(int16x8_t a) {
  return vclsq_s16(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclsq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
int32x4_t test_vclsq_s32(int32x4_t a) {
  return vclsq_s32(a);
}
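[Editorial note: vcls has no target-independent LLVM intrinsic, so these tests still check for @llvm.arm.neon.vcls.*; what changed is only that the bitcast noise around the call is gone. For reference, a scalar model of one lane, assuming the GCC/Clang builtin __builtin_clrsb (count leading redundant sign bits), which matches the VCLS definition; lane_vcls_s32 is a hypothetical helper:]

#include <stdint.h>

/* One lane of vcls_s32: number of bits directly following the sign
 * bit that are identical to it (0..31 for a 32-bit lane). */
static int lane_vcls_s32(int32_t x) {
  return __builtin_clrsb(x);
}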
-
-// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vclt_s8(
// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -1969,7 +1808,7 @@ uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
  return vclt_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vclt_s16(
// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -1977,7 +1816,7 @@ uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
  return vclt_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vclt_s32(
// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1985,7 +1824,7 @@ uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
  return vclt_s32(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vclt_f32(
// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -1993,7 +1832,7 @@ uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
  return vclt_f32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vclt_u8(
// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
@@ -2001,7 +1840,7 @@ uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
  return vclt_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vclt_u16(
// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
@@ -2009,7 +1848,7 @@ uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
  return vclt_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vclt_u32(
// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
@@ -2017,7 +1856,7 @@ uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
  return vclt_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcltq_s8(
// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -2025,7 +1864,7 @@ uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
  return vcltq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcltq_s16(
// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -2033,7 +1872,7 @@ uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
  return vcltq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcltq_s32(
// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -2041,7 +1880,7 @@ uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
  return vcltq_s32(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vcltq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -2049,7 +1888,7 @@ uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
  return vcltq_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcltq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
@@ -2057,7 +1896,7 @@ uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
  return vcltq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcltq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
@@ -2065,7 +1904,7 @@ uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
  return vcltq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcltq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
@@ -2073,253 +1912,233 @@ uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
  return vcltq_u32(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclz_s8(
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclz_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclz_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclz_u8(
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

-// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclz_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

-// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclz_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclzq_s8(
// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

-// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclzq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclzq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
int32x4_t test_vclzq_s32(int32x4_t a) {
  return vclzq_s32(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vclzq_u8(
// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
uint8x16_t test_vclzq_u8(uint8x16_t a) {
  return vclzq_u8(a);
}

-// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vclzq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
uint16x8_t test_vclzq_u16(uint16x8_t a) {
  return vclzq_u16(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vclzq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
uint32x4_t test_vclzq_u32(uint32x4_t a) {
  return vclzq_u32(a);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcnt_u8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK: ret <8 x i8> [[VCNT_V_I]]
uint8x8_t test_vcnt_u8(uint8x8_t a) {
  return vcnt_u8(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcnt_s8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK: ret <8 x i8> [[VCNT_V_I]]
int8x8_t test_vcnt_s8(int8x8_t a) {
  return vcnt_s8(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcnt_p8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK: ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcnt_p8(poly8x8_t a) {
  return vcnt_p8(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcntq_u8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
uint8x16_t test_vcntq_u8(uint8x16_t a) {
  return vcntq_u8(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcntq_s8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
int8x16_t test_vcntq_s8(int8x16_t a) {
  return vcntq_s8(a);
}

-// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vcntq_p8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
poly8x16_t test_vcntq_p8(poly8x16_t a) {
  return vcntq_p8(a);
}
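[Editorial note: the vclz and vcnt groups map directly onto the target-independent @llvm.ctlz.* and @llvm.ctpop.* intrinsics; the "i1 false" operand on ctlz means a zero input is defined and yields the bit width. A scalar sketch of the same semantics, assuming the standard __builtin_clz/__builtin_popcount builtins, with an explicit zero check because __builtin_clz alone is undefined at 0; the lane_* helpers are hypothetical:]

#include <stdint.h>

/* One lane of vclz_u32: defined-at-zero count of leading zeros,
 * matching "ctlz(x, i1 false)". */
static int lane_vclz_u32(uint32_t x) {
  return x ? __builtin_clz(x) : 32;
}

/* One lane of vcnt_u8: population count, matching ctpop. */
static int lane_vcnt_u8(uint8_t x) {
  return __builtin_popcount(x);
}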
-
-// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcombine_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
  return vcombine_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcombine_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
  return vcombine_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcombine_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
  return vcombine_s32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vcombine_s64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
  return vcombine_s64(a, b);
}

-// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
+// CHECK-LABEL: @test_vcombine_f16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
  return vcombine_f16(a, b);
}

-// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vcombine_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
  return vcombine_f32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcombine_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
  return vcombine_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcombine_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
  return vcombine_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vcombine_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
  return vcombine_u32(a, b);
}

-// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vcombine_u64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
  return vcombine_u64(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vcombine_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
  return vcombine_p8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vcombine_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
  return vcombine_p16(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_s8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
@@ -2327,45 +2146,41 @@ int8x8_t test_vcreate_s8(uint64_t a) {
  return vclz_s8(vcreate_s8(a));
}

-// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_s16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
int16x4_t test_vcreate_s16(uint64_t a) {
  return vclz_s16(vcreate_s16(a));
}

-// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_s32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
int32x2_t test_vcreate_s32(uint64_t a) {
  return vclz_s32(vcreate_s32(a));
}

-// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_f16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) {
  return vcreate_f16(a);
}

-// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_f32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) {
  return vcreate_f32(a);
}

-// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_u8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
@@ -2373,45 +2188,36 @@ uint8x8_t test_vcreate_u8(uint64_t a) {
  return vclz_s8(vcreate_u8(a));
}
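[Editorial note: the vcreate_* checks reduce to a single bitcast of the i64 argument; the vclz/vcnt calls in these tests exist only so the result is used. In C terms the intrinsic is a type-punning copy of eight bytes. A sketch using memcpy, the portable way to express that bitcast; model_u8x8 and model_vcreate_u8 are hypothetical stand-ins for the NEON type and intrinsic:]

#include <stdint.h>
#include <string.h>

/* Model of vcreate_u8: reinterpret a 64-bit scalar as 8 x u8,
 * i.e. the "bitcast i64 %a to <8 x i8>" the tests check. */
typedef struct { uint8_t lane[8]; } model_u8x8;

static model_u8x8 model_vcreate_u8(uint64_t a) {
  model_u8x8 v;
  memcpy(&v, &a, sizeof v); /* lane order follows the target's endianness */
  return v;
}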
-// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_u16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
uint16x4_t test_vcreate_u16(uint64_t a) {
  return vclz_s16(vcreate_u16(a));
}

-// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_u32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) #4
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
uint32x2_t test_vcreate_u32(uint64_t a) {
  return vclz_s32(vcreate_u32(a));
}

-
-// We have two ways of lowering that. Either with one 'vmov d, r, r' or
-// with two 'vmov d[],r'. LLVM does the latter. We may want to be less
-// strict about the matching pattern if it starts causing problem.
-// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_u64(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK: ret <1 x i64> [[ADD_I]]
uint64x1_t test_vcreate_u64(uint64_t a) {
  uint64x1_t tmp = vcreate_u64(a);
  return vadd_u64(tmp, tmp);
-
}

-// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_p8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK: ret <8 x i8> [[VCNT_V_I]]
@@ -2419,7 +2225,7 @@ poly8x8_t test_vcreate_p8(uint64_t a) {
  return vcnt_p8(vcreate_p8(a));
}

-// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_p16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
@@ -2432,7 +2238,7 @@ poly16x4_t test_vcreate_p16(uint64_t a) {
  return vbsl_p16(tmp, tmp, tmp);
}

-// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
+// CHECK-LABEL: @test_vcreate_s64(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK: ret <1 x i64> [[ADD_I]]
@@ -2441,11 +2247,9 @@ int64x1_t test_vcreate_s64(uint64_t a) {
  return vadd_s64(tmp, tmp);
}

-
-// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
+// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK: ret <4 x half> [[TMP1]]
@@ -2453,57 +2257,49 @@ float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}

-
-// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

-// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}

-
-// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK-LABEL: @test_vcvt_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP1]]
+// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}

-
-// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
@@ -2512,7 +2308,7 @@ float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
  return vcvt_n_f32_s32(a, 1);
}

-// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
@@ -2521,7 +2317,7 @@ float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
  return vcvt_n_f32_u32(a, 1);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_n_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
@@ -2530,7 +2326,7 @@ float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
  return vcvtq_n_f32_s32(a, 3);
}

-// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_n_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
@@ -2539,8 +2335,7 @@ float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
  return vcvtq_n_f32_u32(a, 3);
}
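[Editorial note: the vcvt_n_* conversions are fixed-point. The immediate operand is the number of fraction bits, so the int-to-float direction scales the integer by 2^-n. A scalar model of test_vcvt_n_f32_s32 above (n = 1), assuming only plain C arithmetic; lane_vcvt_n_f32_s32 is a hypothetical helper:]

#include <stdint.h>

/* One lane of vcvt_n_f32_s32 with 1 fraction bit: the integer is
 * read as a fixed-point value, i.e. divided by 2^1. */
static float lane_vcvt_n_f32_s32(int32_t x) {
  return (float)x / (float)(1u << 1);
}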
-
-// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
@@ -2549,7 +2344,7 @@ int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
  return vcvt_n_s32_f32(a, 1);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_n_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
@@ -2558,8 +2353,7 @@ int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
  return vcvtq_n_s32_f32(a, 3);
}

-
-// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_n_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
@@ -2568,7 +2362,7 @@ uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
  return vcvt_n_u32_f32(a, 1);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_n_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
@@ -2577,201 +2371,193 @@ uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
  return vcvtq_n_u32_f32(a, 3);
}

-
-// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VCVT_I]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VCVT_I]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}

-
-// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvt_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
+// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VCVT_I]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

-// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vcvtq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VCVT_I]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_u8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
  return vdup_lane_u8(a, 7);
}

-// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_u16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE]]
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
  return vdup_lane_u16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_u32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i32> [[SHUFFLE]]
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
  return vdup_lane_u32(a, 1);
}

-// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_s8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
int8x8_t test_vdup_lane_s8(int8x8_t a) {
  return vdup_lane_s8(a, 7);
}

-// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE]]
int16x4_t test_vdup_lane_s16(int16x4_t a) {
  return vdup_lane_s16(a, 3);
}

-// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i32> [[SHUFFLE]]
int32x2_t test_vdup_lane_s32(int32x2_t a) {
  return vdup_lane_s32(a, 1);
}

-// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_p8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
  return vdup_lane_p8(a, 7);
}

-// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_p16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE]]
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
  return vdup_lane_p16(a, 3);
}

-// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vdup_lane_f32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x float> [[SHUFFLE]]
float32x2_t test_vdup_lane_f32(float32x2_t a) {
  return vdup_lane_f32(a, 1);
}
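[Editorial note: each vdup_lane_* test checks a single shufflevector whose mask repeats the selected source lane, with no intermediate moves. A scalar sketch of test_vdup_lane_u32 above (lane 1 broadcast to both result lanes); model_vdup_lane_u32 is a hypothetical helper:]

#include <stdint.h>

/* Model of vdup_lane_u32(a, 1): broadcast lane 1, the C analogue
 * of "shufflevector ... <2 x i32> <i32 1, i32 1>". */
static void model_vdup_lane_u32(const uint32_t a[2], uint32_t out[2]) {
  out[0] = a[1];
  out[1] = a[1];
}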
test_vdupq_lane_u8(uint8x8_t a) { return vdupq_lane_u8(a, 7); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: ret <8 x i16> [[SHUFFLE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: ret <4 x i32> [[SHUFFLE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_s8( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK: ret <16 x i8> [[SHUFFLE]] int8x16_t test_vdupq_lane_s8(int8x8_t a) { return vdupq_lane_s8(a, 7); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: ret <8 x i16> [[SHUFFLE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } -// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: ret <4 x i32> [[SHUFFLE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_p8( // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK: ret <16 x i8> [[SHUFFLE]] poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { return vdupq_lane_p8(a, 7); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_p16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: ret <8 x i16> [[SHUFFLE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } -// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: ret <4 x float> [[SHUFFLE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } -// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vdup_lane_s64( // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer // CHECK: ret <1 x i64> [[SHUFFLE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } -// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vdup_lane_u64( // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer // CHECK: ret <1 x i64> 
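
Every vdup_lane and vdupq_lane test above expects a single shufflevector whose mask repeats the selected lane; moving from a d to a q result only lengthens the mask. A sketch of the equivalence, with a hypothetical helper name:

    // dup-lane-sketch.c: illustrative only.
    #include <arm_neon.h>
    // Splatting lane 1 of a 2-lane vector into 4 lanes is a shuffle
    // with mask <1, 1, 1, 1>; no memory round-trip is involved.
    int32x4_t splat_lane1(int32x2_t v) {
      return vdupq_lane_s32(v, 1);
    }
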
[[SHUFFLE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } -// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_s64( // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer // CHECK: ret <2 x i64> [[SHUFFLE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } -// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vdupq_lane_u64( // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer // CHECK: ret <2 x i64> [[SHUFFLE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } - -// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 zeroext %a) #0 { +// CHECK-LABEL: @test_vdup_n_u8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -2785,7 +2571,7 @@ uint8x8_t test_vdup_n_u8(uint8_t a) { return vdup_n_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 zeroext %a) #0 { +// CHECK-LABEL: @test_vdup_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -2795,7 +2581,7 @@ uint16x4_t test_vdup_n_u16(uint16_t a) { return vdup_n_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vdup_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: ret <2 x i32> [[VECINIT1_I]] @@ -2803,7 +2589,7 @@ uint32x2_t test_vdup_n_u32(uint32_t a) { return vdup_n_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vdup_n_s8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -2817,7 +2603,7 @@ int8x8_t test_vdup_n_s8(int8_t a) { return vdup_n_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vdup_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -2827,7 +2613,7 @@ int16x4_t test_vdup_n_s16(int16_t a) { return vdup_n_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vdup_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: ret <2 x i32> [[VECINIT1_I]] @@ -2835,7 +2621,7 @@ int32x2_t test_vdup_n_s32(int32_t a) { return vdup_n_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vdup_n_p8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, 
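
vdup_n_* starts from a scalar rather than a lane, so its expected IR is different in kind: one insertelement per lane, all reading the same scalar. Sketch (hypothetical helper):

    // dup-n-sketch.c: illustrative only.
    #include <arm_neon.h>
    // For a 2-lane dup the chain is two insertelements:
    //   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
    //   %v1 = insertelement <2 x i32> %v0, i32 %a, i32 1
    int32x2_t splat_scalar(int32_t a) {
      return vdup_n_s32(a);
    }
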
i32 2 @@ -2849,7 +2635,7 @@ poly8x8_t test_vdup_n_p8(poly8_t a) { return vdup_n_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vdup_n_p16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -2859,7 +2645,7 @@ poly16x4_t test_vdup_n_p16(poly16_t a) { return vdup_n_p16(a); } -// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 { +// CHECK-LABEL: @test_vdup_n_f16( // CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 // CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 // CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 @@ -2870,7 +2656,7 @@ float16x4_t test_vdup_n_f16(float16_t *a) { return vdup_n_f16(*a); } -// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 { +// CHECK-LABEL: @test_vdup_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1 // CHECK: ret <2 x float> [[VECINIT1_I]] @@ -2878,7 +2664,7 @@ float32x2_t test_vdup_n_f32(float32_t a) { return vdup_n_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 zeroext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_u8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -2900,7 +2686,7 @@ uint8x16_t test_vdupq_n_u8(uint8_t a) { return vdupq_n_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 zeroext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -2914,7 +2700,7 @@ uint16x8_t test_vdupq_n_u16(uint16_t a) { return vdupq_n_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vdupq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 @@ -2924,7 +2710,7 @@ uint32x4_t test_vdupq_n_u32(uint32_t a) { return vdupq_n_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_s8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -2946,7 +2732,7 @@ int8x16_t test_vdupq_n_s8(int8_t a) { return vdupq_n_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -2960,7 +2746,7 @@ int16x8_t test_vdupq_n_s16(int16_t a) { return vdupq_n_s16(a); } -// CHECK-LABEL: define <4 x 
i32> @test_vdupq_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vdupq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 @@ -2970,7 +2756,7 @@ int32x4_t test_vdupq_n_s32(int32_t a) { return vdupq_n_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_p8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -2992,7 +2778,7 @@ poly8x16_t test_vdupq_n_p8(poly8_t a) { return vdupq_n_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vdupq_n_p16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -3006,7 +2792,7 @@ poly16x8_t test_vdupq_n_p16(poly16_t a) { return vdupq_n_p16(a); } -// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 { +// CHECK-LABEL: @test_vdupq_n_f16( // CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 // CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 // CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 @@ -3021,7 +2807,7 @@ float16x8_t test_vdupq_n_f16(float16_t *a) { return vdupq_n_f16(*a); } -// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 { +// CHECK-LABEL: @test_vdupq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2 @@ -3031,7 +2817,7 @@ float32x4_t test_vdupq_n_f32(float32_t a) { return vdupq_n_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vdup_n_s64( // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] // CHECK: ret <1 x i64> [[ADD_I]] @@ -3040,17 +2826,16 @@ int64x1_t test_vdup_n_s64(int64_t a) { return vadd_s64(tmp, tmp); } -// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vdup_n_u64( // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] // CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vdup_n_u64(uint64_t a) { int64x1_t tmp = vdup_n_u64(a); return vadd_s64(tmp, tmp); - } -// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vdupq_n_s64( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] @@ -3060,7 +2845,7 @@ int64x2_t test_vdupq_n_s64(int64_t a) { return vaddq_s64(tmp, tmp); } -// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vdupq_n_u64( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> 
[[VECINIT_I]], i64 %a, i32 1 // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] @@ -3070,142 +2855,140 @@ uint64x2_t test_vdupq_n_u64(uint64_t a) { return vaddq_u64(tmp, tmp); } - -// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_veor_s8( // CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b // CHECK: ret <8 x i8> [[XOR_I]] int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { return veor_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_veor_s16( // CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b // CHECK: ret <4 x i16> [[XOR_I]] int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { return veor_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_veor_s32( // CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b // CHECK: ret <2 x i32> [[XOR_I]] int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { return veor_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_veor_s64( // CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b // CHECK: ret <1 x i64> [[XOR_I]] int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { return veor_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_veor_u8( // CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b // CHECK: ret <8 x i8> [[XOR_I]] uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_veor_u16( // CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b // CHECK: ret <4 x i16> [[XOR_I]] uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_veor_u32( // CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b // CHECK: ret <2 x i32> [[XOR_I]] uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_veor_u64( // CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b // CHECK: ret <1 x i64> [[XOR_I]] uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_veorq_s8( // CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b // CHECK: ret <16 x i8> [[XOR_I]] int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { return veorq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_veorq_s16( // CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b // CHECK: ret <8 x i16> [[XOR_I]] int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_veorq_s32( // CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b // CHECK: ret <4 x i32> [[XOR_I]] int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_veorq_s64( // CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b // CHECK: ret <2 x i64> [[XOR_I]] int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) 
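
veor lowers to a bare IR xor with no intrinsic call and no bitcasts, which is why each veor hunk reduces to a one-line label change. Sketch (hypothetical helper):

    // veor-sketch.c: illustrative only.
    #include <arm_neon.h>
    // Expected IR shape: %xor = xor <8 x i8> %a, %b
    uint8x8_t vec_xor(uint8x8_t a, uint8x8_t b) {
      return veor_u8(a, b);
    }
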
#0 { +// CHECK-LABEL: @test_veorq_u8( // CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b // CHECK: ret <16 x i8> [[XOR_I]] uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { return veorq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_veorq_u16( // CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b // CHECK: ret <8 x i16> [[XOR_I]] uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { return veorq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_veorq_u32( // CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b // CHECK: ret <4 x i32> [[XOR_I]] uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_veorq_u64( // CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b // CHECK: ret <2 x i64> [[XOR_I]] uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vext_s8( // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> // CHECK: ret <8 x i8> [[VEXT]] int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 7); } -// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vext_u8( // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> // CHECK: ret <8 x i8> [[VEXT]] uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, 7); } -// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vext_p8( // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> // CHECK: ret <8 x i8> [[VEXT]] poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vext_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -3216,7 +2999,7 @@ int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) { return vext_s16(a, b, 3); } -// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vext_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -3227,7 +3010,7 @@ uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, 3); } -// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vext_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -3238,7 +3021,7 @@ poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vext_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: 
[[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -3249,7 +3032,7 @@ int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) { return vext_s32(a, b, 1); } -// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vext_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -3260,7 +3043,7 @@ uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, 1); } -// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vext_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -3271,7 +3054,7 @@ int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) { return vext_s64(a, b, 0); } -// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vext_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -3282,7 +3065,7 @@ uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, 0); } -// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vext_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> @@ -3293,28 +3076,28 @@ float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { return vext_f32(a, b, 1); } -// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vextq_s8( // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> // CHECK: ret <16 x i8> [[VEXT]] int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, 15); } -// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vextq_u8( // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> // CHECK: ret <16 x i8> [[VEXT]] uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, 15); } -// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vextq_p8( // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> // CHECK: ret <16 x i8> [[VEXT]] poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vextq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> @@ -3325,7 +3108,7 @@ int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, 7); } -// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 
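
vext concatenates its two operands and takes an element-sized sliding window, so the expected IR is a shufflevector whose mask counts up from the split point (7 through 14 for the byte tests above, which pass n == 7). Sketch with a smaller offset (hypothetical helper):

    // vext-sketch.c: illustrative only.
    #include <arm_neon.h>
    // vext_u8(a, b, 3) yields a[3..7] followed by b[0..2], i.e. a
    // shufflevector with mask <3, 4, 5, 6, 7, 8, 9, 10>.
    uint8x8_t window3(uint8x8_t a, uint8x8_t b) {
      return vext_u8(a, b, 3);
    }
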
x i16> %b) #0 { +// CHECK-LABEL: @test_vextq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> @@ -3336,7 +3119,7 @@ uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, 7); } -// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vextq_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> @@ -3347,7 +3130,7 @@ poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, 7); } -// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vextq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> @@ -3358,7 +3141,7 @@ int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, 3); } -// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vextq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> @@ -3369,7 +3152,7 @@ uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, 3); } -// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vextq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> @@ -3380,7 +3163,7 @@ int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, 1); } -// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vextq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> @@ -3391,7 +3174,7 @@ uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, 1); } -// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vextq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> @@ -3402,155 +3185,140 @@ float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, 3); } - -// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vfma_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 -// CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, 
<2 x float> %c, <2 x float> %a) #4 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfma_f32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK-LABEL: @test_vfmaq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a) #4 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vfms_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 -// CHECK: ret <2 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a) #4 +// CHECK: ret <2 x float> [[TMP3]] float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfms_f32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK-LABEL: @test_vfmsq_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 -// CHECK: ret <4 x float> [[TMP6]] +// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a) #4 +// CHECK: ret <4 x float> [[TMP3]] float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmsq_f32(a, b, c); } - -// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_high_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t 
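
The vfma and vfms hunks above are the same bitcast cleanup applied to @llvm.fma: the call now takes %b, %c, %a directly, and vfms differs only in an fsub that negates the multiplicand first. Sketch of the source-level pairing (hypothetical helper):

    // fma-sketch.c: illustrative only.
    #include <arm_neon.h>
    // vfma_f32(a, b, c) computes a + b*c via @llvm.fma.v2f32(b, c, a);
    // vfms_f32(a, b, c) computes a - b*c by negating b before the call.
    float32x2_t fused_madd(float32x2_t a, float32x2_t b, float32x2_t c) {
      return vfma_f32(a, b, c);
    }
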
test_vget_high_s8(int8x16_t a) { return vget_high_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_high_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_high_s16(int16x8_t a) { return vget_high_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_high_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { return vget_high_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_high_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1> // CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_high_s64(int64x2_t a) { return vget_high_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vget_high_f16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_high_f16(float16x8_t a) { return vget_high_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vget_high_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_high_f32(float32x4_t a) { return vget_high_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_high_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_high_u8(uint8x16_t a) { return vget_high_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_high_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_high_u16(uint16x8_t a) { return vget_high_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_high_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_high_u32(uint32x4_t a) { return vget_high_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_high_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1> // CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { return vget_high_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_high_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_high_p8(poly8x16_t a) { return vget_high_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_high_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, 
<4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_high_p16(poly16x8_t a) { return vget_high_p16(a); } - -// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_lane_u8( // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 // CHECK: ret i8 [[VGET_LANE]] uint8_t test_vget_lane_u8(uint8x8_t a) { return vget_lane_u8(a, 7); } -// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 @@ -3559,7 +3327,7 @@ uint16_t test_vget_lane_u16(uint16x4_t a) { return vget_lane_u16(a, 3); } -// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 @@ -3568,14 +3336,14 @@ uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); } -// CHECK-LABEL: define signext i8 @test_vget_lane_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_lane_s8( // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 // CHECK: ret i8 [[VGET_LANE]] int8_t test_vget_lane_s8(int8x8_t a) { return vget_lane_s8(a, 7); } -// CHECK-LABEL: define signext i16 @test_vget_lane_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 @@ -3584,7 +3352,7 @@ int16_t test_vget_lane_s16(int16x4_t a) { return vget_lane_s16(a, 3); } -// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 @@ -3593,14 +3361,14 @@ int32_t test_vget_lane_s32(int32x2_t a) { return vget_lane_s32(a, 1); } -// CHECK-LABEL: define signext i8 @test_vget_lane_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_lane_p8( // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 // CHECK: ret i8 [[VGET_LANE]] poly8_t test_vget_lane_p8(poly8x8_t a) { return vget_lane_p8(a, 7); } -// CHECK-LABEL: define signext i16 @test_vget_lane_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 @@ -3609,7 +3377,7 @@ poly16_t test_vget_lane_p16(poly16x4_t a) { return vget_lane_p16(a, 3); } -// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vget_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 @@ -3618,7 +3386,7 @@ float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); } -// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vget_lane_f16( // CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8 // CHECK: 
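
The vget_high tests above select the upper half of a q register with one shufflevector (indices 8 through 15 for 16 bytes); the vget_low group further down uses indices 0 through 7. Sketch (hypothetical helper):

    // get-half-sketch.c: illustrative only.
    #include <arm_neon.h>
    // Upper half of a 16-byte vector: shuffle mask <8, 9, ..., 15>.
    int8x8_t upper_half(int8x16_t v) {
      return vget_high_s8(v);
    }
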
[[__REINT1_242:%.*]] = alloca i16, align 2 // CHECK: store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8 @@ -3636,14 +3404,14 @@ float32_t test_vget_lane_f16(float16x4_t a) { return vget_lane_f16(a, 1); } -// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_u8( // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 // CHECK: ret i8 [[VGET_LANE]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { return vgetq_lane_u8(a, 15); } -// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 @@ -3652,7 +3420,7 @@ uint16_t test_vgetq_lane_u16(uint16x8_t a) { return vgetq_lane_u16(a, 7); } -// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 @@ -3661,14 +3429,14 @@ uint32_t test_vgetq_lane_u32(uint32x4_t a) { return vgetq_lane_u32(a, 3); } -// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_s8( // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 // CHECK: ret i8 [[VGET_LANE]] int8_t test_vgetq_lane_s8(int8x16_t a) { return vgetq_lane_s8(a, 15); } -// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 @@ -3677,7 +3445,7 @@ int16_t test_vgetq_lane_s16(int16x8_t a) { return vgetq_lane_s16(a, 7); } -// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 @@ -3686,14 +3454,14 @@ int32_t test_vgetq_lane_s32(int32x4_t a) { return vgetq_lane_s32(a, 3); } -// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_p8( // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 // CHECK: ret i8 [[VGET_LANE]] poly8_t test_vgetq_lane_p8(poly8x16_t a) { return vgetq_lane_p8(a, 15); } -// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 @@ -3702,7 +3470,7 @@ poly16_t test_vgetq_lane_p16(poly16x8_t a) { return vgetq_lane_p16(a, 7); } -// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 @@ -3711,7 +3479,7 @@ float32_t test_vgetq_lane_f32(float32x4_t a) { return vgetq_lane_f32(a, 3); } -// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x 
half> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_f16( // CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16 // CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2 // CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16 @@ -3729,8 +3497,7 @@ float32_t test_vgetq_lane_f16(float16x8_t a) { return vgetq_lane_f16(a, 3); } -// The optimizer is able to remove all moves now. -// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 @@ -3739,8 +3506,7 @@ int64_t test_vget_lane_s64(int64x1_t a) { return vget_lane_s64(a, 0); } -// The optimizer is able to remove all moves now. -// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 @@ -3749,7 +3515,7 @@ uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); } -// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 @@ -3758,7 +3524,7 @@ int64_t test_vgetq_lane_s64(int64x2_t a) { return vgetq_lane_s64(a, 1); } -// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vgetq_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 @@ -3767,366 +3533,314 @@ uint64_t test_vgetq_lane_u64(uint64x2_t a) { return vgetq_lane_u64(a, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_low_s8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_low_s8(int8x16_t a) { return vget_low_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_low_s16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_low_s16(int16x8_t a) { return vget_low_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_low_s32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_low_s32(int32x4_t a) { return vget_low_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_low_s64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer // CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_low_s64(int64x2_t a) { return vget_low_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vget_low_f16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 
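
The vget_lane and vgetq_lane tests above compile to a single extractelement; the 16- and 32-bit variants keep a bitcast pair only because the underlying builtin is defined on byte vectors, and the f16 variants bounce through an alloca because there is no direct half-to-float lane read here. Sketch of the common case (hypothetical helper):

    // get-lane-sketch.c: illustrative only.
    #include <arm_neon.h>
    // Expected IR shape: %e = extractelement <2 x i32> %v, i32 1
    int32_t lane1(int32x2_t v) {
      return vget_lane_s32(v, 1);
    }
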
// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_low_f16(float16x8_t a) { return vget_low_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vget_low_f32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1> // CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_low_f32(float32x4_t a) { return vget_low_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_low_u8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { return vget_low_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_low_u16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_low_u16(uint16x8_t a) { return vget_low_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vget_low_u32( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1> // CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_low_u32(uint32x4_t a) { return vget_low_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vget_low_u64( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer // CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_low_u64(uint64x2_t a) { return vget_low_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vget_low_p8( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_low_p8(poly8x16_t a) { return vget_low_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vget_low_p16( // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_low_p16(poly16x8_t a) { return vget_low_p16(a); } - -// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vhadd_s8( // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) { return vhadd_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vhadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: 
ret <4 x i16> [[VHADD_V2_I]] int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) { return vhadd_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vhadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VHADD_V2_I]] int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) { return vhadd_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vhadd_u8( // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VHADD_V_I]] uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) { return vhadd_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vhadd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VHADD_V2_I]] uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) { return vhadd_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vhadd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VHADD_V2_I]] uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) { return vhadd_u32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vhaddq_s8( // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VHADDQ_V_I]] int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) { return vhaddq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> 
@test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vhaddq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) { return vhaddq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vhaddq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VHADDQ_V2_I]] int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) { return vhaddq_s32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vhaddq_u8( // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VHADDQ_V_I]] uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) { return vhaddq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vhaddq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) { return vhaddq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vhaddq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VHADDQ_V2_I]] uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) { return vhaddq_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vhsub_s8( // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VHSUB_V_I]] int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) { return vhsub_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vhsub_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VHSUB_V2_I]] int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) { return vhsub_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vhsub_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VHSUB_V2_I]] int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) { return vhsub_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vhsub_u8( // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VHSUB_V_I]] uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) { return vhsub_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vhsub_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x 
i16> %a, <4 x i16> %b) #4 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VHSUB_V2_I]] uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vhsub_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VHSUB_V2_I]] uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vhsubq_s8( // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]] int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) { return vhsubq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vhsubq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) { return vhsubq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vhsubq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) { return vhsubq_s32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: 
@test_vhsubq_u8( // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]] uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) { return vhsubq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vhsubq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) { return vhsubq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vhsubq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) { return vhsubq_u32(a, b); } - -// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_u8( // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1) // CHECK: ret <16 x i8> [[VLD1]] uint8x16_t test_vld1q_u8(uint8_t const * a) { return vld1q_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <8 x i16> [[VLD1]] @@ -4134,7 +3848,7 @@ uint16x8_t test_vld1q_u16(uint16_t const * a) { return vld1q_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <4 x i32> [[VLD1]] @@ -4142,7 +3856,7 @@ uint32x4_t test_vld1q_u32(uint32_t const * a) { return vld1q_u32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <2 x i64> [[VLD1]] @@ -4150,14 +3864,14 @@ uint64x2_t test_vld1q_u64(uint64_t const * a) { return vld1q_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 { +// 
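
// Note the pattern of this whole update: every CHECK-LABEL shrinks from the
// full "define <ty> @name(args) #0 {" line to just "@name(". FileCheck's
// CHECK-LABEL matches a substring of a single output line, so the symbol name
// plus the opening parenthesis is enough to anchor each function while staying
// immune to churn in attributes and parameter names. A sketch in the file's
// own style, using a hypothetical test function (the exact IR may differ):

// CHECK-LABEL: @demo_add(
// CHECK: add <8 x i8>
int8x8_t demo_add(int8x8_t a, int8x8_t b) {
  return vadd_s8(a, b);
}
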
CHECK-LABEL: @test_vld1q_s8( // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1) // CHECK: ret <16 x i8> [[VLD1]] int8x16_t test_vld1q_s8(int8_t const * a) { return vld1q_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <8 x i16> [[VLD1]] @@ -4165,7 +3879,7 @@ int16x8_t test_vld1q_s16(int16_t const * a) { return vld1q_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <4 x i32> [[VLD1]] @@ -4173,7 +3887,7 @@ int32x4_t test_vld1q_s32(int32_t const * a) { return vld1q_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <2 x i64> [[VLD1]] @@ -4181,7 +3895,7 @@ int64x2_t test_vld1q_s64(int64_t const * a) { return vld1q_s64(a); } -// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1q_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half> @@ -4190,7 +3904,7 @@ float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } -// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1q_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <4 x float> [[VLD1]] @@ -4198,14 +3912,14 @@ float32x4_t test_vld1q_f32(float32_t const * a) { return vld1q_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_p8( // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1) // CHECK: ret <16 x i8> [[VLD1]] poly8x16_t test_vld1q_p8(poly8_t const * a) { return vld1q_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <8 x i16> [[VLD1]] @@ -4213,14 +3927,14 @@ poly16x8_t test_vld1q_p16(poly16_t const * a) { return vld1q_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_u8( // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1) // CHECK: ret <8 x i8> [[VLD1]] uint8x8_t test_vld1_u8(uint8_t const * a) { return vld1_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <4 x i16> [[VLD1]] @@ -4228,7 +3942,7 @@ uint16x4_t test_vld1_u16(uint16_t const * a) { return vld1_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[VLD1:%.*]] = call <2 x i32> 
@llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <2 x i32> [[VLD1]] @@ -4236,7 +3950,7 @@ uint32x2_t test_vld1_u32(uint32_t const * a) { return vld1_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <1 x i64> [[VLD1]] @@ -4244,14 +3958,14 @@ uint64x1_t test_vld1_u64(uint64_t const * a) { return vld1_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_s8( // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1) // CHECK: ret <8 x i8> [[VLD1]] int8x8_t test_vld1_s8(int8_t const * a) { return vld1_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <4 x i16> [[VLD1]] @@ -4259,7 +3973,7 @@ int16x4_t test_vld1_s16(int16_t const * a) { return vld1_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <2 x i32> [[VLD1]] @@ -4267,7 +3981,7 @@ int32x2_t test_vld1_s32(int32_t const * a) { return vld1_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <1 x i64> [[VLD1]] @@ -4275,7 +3989,7 @@ int64x1_t test_vld1_s64(int64_t const * a) { return vld1_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half> @@ -4284,7 +3998,7 @@ float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4) // CHECK: ret <2 x float> [[VLD1]] @@ -4292,14 +4006,14 @@ float32x2_t test_vld1_f32(float32_t const * a) { return vld1_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_p8( // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1) // CHECK: ret <8 x i8> [[VLD1]] poly8x8_t test_vld1_p8(poly8_t const * a) { return vld1_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2) // CHECK: ret <4 x i16> [[VLD1]] @@ -4307,8 +4021,7 @@ poly16x4_t test_vld1_p16(poly16_t const * a) { return vld1_p16(a); } - -// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_u8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = 
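
// The vld1 family above loads one full 64-bit (vld1_*) or 128-bit (vld1q_*)
// vector from memory; the checks match the @llvm.arm.neon.vld1 call, whose
// trailing i32 operand is the element alignment (1, 2, or 4 above). A usage
// sketch, assuming <arm_neon.h> (names are illustrative):

#include <arm_neon.h>

// Loads four contiguous uint32_t values: lanes are p[0], p[1], p[2], p[3].
uint32x4_t load_four(const uint32_t *p) {
  return vld1q_u32(p);
}
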
shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer @@ -4317,7 +4030,7 @@ uint8x16_t test_vld1q_dup_u8(uint8_t const * a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4328,7 +4041,7 @@ uint16x8_t test_vld1q_dup_u16(uint16_t const * a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* // CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 @@ -4339,7 +4052,7 @@ uint32x4_t test_vld1q_dup_u32(uint32_t const * a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* // CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4 @@ -4350,7 +4063,7 @@ uint64x2_t test_vld1q_dup_u64(uint64_t const * a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_s8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer @@ -4359,7 +4072,7 @@ int8x16_t test_vld1q_dup_s8(int8_t const * a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4370,7 +4083,7 @@ int16x8_t test_vld1q_dup_s16(int16_t const * a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* // CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 @@ -4381,7 +4094,7 @@ int32x4_t test_vld1q_dup_s32(int32_t const * a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* // CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4 @@ -4392,7 +4105,7 @@ int64x2_t test_vld1q_dup_s64(int64_t const * a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4404,7 +4117,7 @@ float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } -// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float* // CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4 @@ -4415,7 +4128,7 @@ float32x4_t test_vld1q_dup_f32(float32_t const * a) { return 
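
// The [[TMP0:%.*]] tokens used throughout these checks are FileCheck pattern
// variables: the first occurrence binds the variable to whatever SSA value
// name clang happens to emit (via the %.* regex), and later bare uses such as
// [[TMP0]] must match the same text. For example, the vld1_u16 checks above
// capture the bitcast result and then require that exact value to feed the
// intrinsic call:
//
// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
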
vld1q_dup_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_p8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer @@ -4424,7 +4137,7 @@ poly8x16_t test_vld1q_dup_p8(poly8_t const * a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1q_dup_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4435,7 +4148,7 @@ poly16x8_t test_vld1q_dup_p16(poly16_t const * a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_u8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer @@ -4444,7 +4157,7 @@ uint8x8_t test_vld1_dup_u8(uint8_t const * a) { return vld1_dup_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4455,7 +4168,7 @@ uint16x4_t test_vld1_dup_u16(uint16_t const * a) { return vld1_dup_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* // CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 @@ -4466,7 +4179,7 @@ uint32x2_t test_vld1_dup_u32(uint32_t const * a) { return vld1_dup_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* // CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4 @@ -4477,7 +4190,7 @@ uint64x1_t test_vld1_dup_u64(uint64_t const * a) { return vld1_dup_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_s8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer @@ -4486,7 +4199,7 @@ int8x8_t test_vld1_dup_s8(int8_t const * a) { return vld1_dup_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4497,7 +4210,7 @@ int16x4_t test_vld1_dup_s16(int16_t const * a) { return vld1_dup_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* // CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 @@ -4508,7 +4221,7 @@ int32x2_t test_vld1_dup_s32(int32_t const * a) { return vld1_dup_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 { +// 
CHECK-LABEL: @test_vld1_dup_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* // CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4 @@ -4519,7 +4232,7 @@ int64x1_t test_vld1_dup_s64(int64_t const * a) { return vld1_dup_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4531,7 +4244,7 @@ float16x4_t test_vld1_dup_f16(float16_t const * a) { return vld1_dup_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float* // CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4 @@ -4542,7 +4255,7 @@ float32x2_t test_vld1_dup_f32(float32_t const * a) { return vld1_dup_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_p8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 // CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer @@ -4551,7 +4264,7 @@ poly8x8_t test_vld1_dup_p8(poly8_t const * a) { return vld1_dup_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 { +// CHECK-LABEL: @test_vld1_dup_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* // CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 @@ -4562,8 +4275,7 @@ poly16x4_t test_vld1_dup_p16(poly16_t const * a) { return vld1_dup_p16(a); } - -// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_u8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 // CHECK: ret <16 x i8> [[VLD1_LANE]] @@ -4571,7 +4283,7 @@ uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) { return vld1q_lane_u8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> @@ -4583,7 +4295,7 @@ uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> @@ -4595,7 +4307,7 @@ uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> @@ -4607,7 +4319,7 @@ uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) 
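
// The vld1_dup checks above all follow one shape: a scalar load, an
// insertelement into lane 0 of an undef vector, and a shufflevector with a
// zeroinitializer mask that broadcasts lane 0 to every lane. In C terms the
// intrinsic splats a single element loaded from memory. A sketch, assuming
// <arm_neon.h> (names are illustrative):

#include <arm_neon.h>

// Every one of the eight lanes becomes *p.
uint16x8_t splat_from_memory(const uint16_t *p) {
  return vld1q_dup_u16(p);
}
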
#0 { +// CHECK-LABEL: @test_vld1q_lane_s8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 // CHECK: ret <16 x i8> [[VLD1_LANE]] @@ -4615,7 +4327,7 @@ int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) { return vld1q_lane_s8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> @@ -4627,7 +4339,7 @@ int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) { return vld1q_lane_s16(a, b, 7); } -// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> @@ -4639,7 +4351,7 @@ int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) { return vld1q_lane_s32(a, b, 3); } -// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> @@ -4651,7 +4363,7 @@ int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) { return vld1q_lane_s64(a, b, 1); } -// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> @@ -4664,7 +4376,7 @@ float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); } -// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> @@ -4676,7 +4388,7 @@ float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) { return vld1q_lane_f32(a, b, 3); } -// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_p8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 // CHECK: ret <16 x i8> [[VLD1_LANE]] @@ -4684,7 +4396,7 @@ poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) { return vld1q_lane_p8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1q_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> @@ -4696,7 +4408,7 @@ poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) { return vld1q_lane_p16(a, b, 7); } -// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_u8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 // CHECK: ret <8 x i8> [[VLD1_LANE]] @@ -4704,7 +4416,7 @@ 
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) { return vld1_lane_u8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> @@ -4716,7 +4428,7 @@ uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) { return vld1_lane_u16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> @@ -4728,7 +4440,7 @@ uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) { return vld1_lane_u32(a, b, 1); } -// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> @@ -4740,7 +4452,7 @@ uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) { return vld1_lane_u64(a, b, 0); } -// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_s8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 // CHECK: ret <8 x i8> [[VLD1_LANE]] @@ -4748,7 +4460,7 @@ int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) { return vld1_lane_s8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> @@ -4760,7 +4472,7 @@ int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) { return vld1_lane_s16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> @@ -4772,7 +4484,7 @@ int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) { return vld1_lane_s32(a, b, 1); } -// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> @@ -4784,7 +4496,7 @@ int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) { return vld1_lane_s64(a, b, 0); } -// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> @@ -4797,7 +4509,7 @@ float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) { return vld1_lane_f16(a, b, 3); } -// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* // CHECK: 
[[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> @@ -4809,7 +4521,7 @@ float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) { return vld1_lane_f32(a, b, 1); } -// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_p8( // CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1 // CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 // CHECK: ret <8 x i8> [[VLD1_LANE]] @@ -4817,7 +4529,7 @@ poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) { return vld1_lane_p8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vld1_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> @@ -4829,571 +4541,304 @@ poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) { return vld1_lane_p16(a, b, 3); } - -// CHECK-LABEL: define void @test_vld2q_u8(%struct.uint8x16x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_u8( // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* -// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> uint8x16x2_t test_vld2q_u8(uint8_t const * a) { return vld2q_u8(a); } -// CHECK-LABEL: define void @test_vld2q_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_u16( // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> uint16x8x2_t test_vld2q_u16(uint16_t const * a) { return vld2q_u16(a); } -// CHECK-LABEL: define void @test_vld2q_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld2q_u32( // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32> } 
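
// vld1_lane, checked above, loads a single element into one lane of an
// existing vector and leaves the other lanes untouched, which is why the
// checks reduce to a scalar load plus an insertelement at the requested
// (compile-time constant) lane index. A sketch, assuming <arm_neon.h>
// (names are illustrative):

#include <arm_neon.h>

// Returns b with lane 3 replaced by *p; the lane index must be a constant.
uint32x4_t patch_top_lane(const uint32_t *p, uint32x4_t b) {
  return vld1q_lane_u32(p, b, 3);
}
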
[[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> uint32x4x2_t test_vld2q_u32(uint32_t const * a) { return vld2q_u32(a); } -// CHECK-LABEL: define void @test_vld2q_s8(%struct.int8x16x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_s8( // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* -// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> int8x16x2_t test_vld2q_s8(int8_t const * a) { return vld2q_s8(a); } -// CHECK-LABEL: define void @test_vld2q_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_s16( // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> int16x8x2_t test_vld2q_s16(int16_t const * a) { return vld2q_s16(a); } -// CHECK-LABEL: define void @test_vld2q_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld2q_s32( // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> int32x4x2_t test_vld2q_s32(int32_t const * a) { return vld2q_s32(a); } -// CHECK-LABEL: define void @test_vld2q_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a) #0 { +// CHECK-LABEL: @test_vld2q_f16( // CHECK: [[__RET:%.*]] = alloca 
%struct.float16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> float16x8x2_t test_vld2q_f16(float16_t const * a) { return vld2q_f16(a); } -// CHECK-LABEL: define void @test_vld2q_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a) #0 { +// CHECK-LABEL: @test_vld2q_f32( // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }* -// CHECK: store { <4 x float>, <4 x float> } [[VLD2Q_V]], { <4 x float>, <4 x float> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> float32x4x2_t test_vld2q_f32(float32_t const * a) { return vld2q_f32(a); } -// CHECK-LABEL: define void @test_vld2q_p8(%struct.poly8x16x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2q_p8( // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* -// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> poly8x16x2_t test_vld2q_p8(poly8_t const * a) { return vld2q_p8(a); } -// CHECK-LABEL: define void @test_vld2q_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld2q_p16( // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] 
to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> poly16x8x2_t test_vld2q_p16(poly16_t const * a) { return vld2q_p16(a); } -// CHECK-LABEL: define void @test_vld2_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2_u8( // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> uint8x8x2_t test_vld2_u8(uint8_t const * a) { return vld2_u8(a); } -// CHECK-LABEL: define void @test_vld2_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld2_u16( // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> uint16x4x2_t test_vld2_u16(uint16_t const * a) { return vld2_u16(a); } -// CHECK-LABEL: define void @test_vld2_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld2_u32( // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> uint32x2x2_t test_vld2_u32(uint32_t const * a) { return vld2_u32(a); } -// CHECK-LABEL: define void @test_vld2_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 { +// CHECK-LABEL: @test_vld2_u64( // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } 
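
// The vld2 intrinsics return a pair of vectors, so clang models them as
// functions returning a %struct.*x2_t through a noalias sret pointer: the
// removed checks above spelled out the alloca, the store of the two-vector
// aggregate returned by @llvm.arm.neon.vld2, and the memcpy into %agg.result,
// while the new checks only pin the intrinsic call itself. From C the two
// de-interleaved vectors are reached through the .val array. A sketch,
// assuming <arm_neon.h> (names are illustrative):

#include <arm_neon.h>

// vld2q_u8 de-interleaves 32 bytes: p[0], p[2], ... land in .val[0] and
// p[1], p[3], ... in .val[1].
uint8x16_t even_bytes(const uint8_t *p) {
  uint8x16x2_t v = vld2q_u8(p);
  return v.val[0];
}
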
@llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> uint64x1x2_t test_vld2_u64(uint64_t const * a) { return vld2_u64(a); } -// CHECK-LABEL: define void @test_vld2_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2_s8( // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> int8x8x2_t test_vld2_s8(int8_t const * a) { return vld2_s8(a); } -// CHECK-LABEL: define void @test_vld2_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld2_s16( // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> int16x4x2_t test_vld2_s16(int16_t const * a) { return vld2_s16(a); } -// CHECK-LABEL: define void @test_vld2_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld2_s32( // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> int32x2x2_t test_vld2_s32(int32_t const * a) { return vld2_s32(a); } -// CHECK-LABEL: define void 
@test_vld2_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 { +// CHECK-LABEL: @test_vld2_s64( // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> int64x1x2_t test_vld2_s64(int64_t const * a) { return vld2_s64(a); } -// CHECK-LABEL: define void @test_vld2_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 { +// CHECK-LABEL: @test_vld2_f16( // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> float16x4x2_t test_vld2_f16(float16_t const * a) { return vld2_f16(a); } -// CHECK-LABEL: define void @test_vld2_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 { +// CHECK-LABEL: @test_vld2_f32( // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* -// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* -// CHECK: store { <2 x float>, <2 x float> } [[VLD2_V]], { <2 x float>, <2 x float> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> float32x2x2_t test_vld2_f32(float32_t const * a) { return vld2_f32(a); } -// CHECK-LABEL: define void @test_vld2_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld2_p8( // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast 
%struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}

-// CHECK-LABEL: define void @test_vld2_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld2_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}

-
-// CHECK-LABEL: define void @test_vld2_dup_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
  return vld2_dup_s8(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
  return vld2_dup_s16(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
  return vld2_dup_s32(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
  return vld2_dup_s64(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
  return vld2_dup_f16(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float> } [[TMP5]], { <2 x float>, <2 x float> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>
float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
  return vld2_dup_f32(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}

-// CHECK-LABEL: define void @test_vld2_dup_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld2_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}

-
-// CHECK-LABEL: define void @test_vld2q_lane_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
@@ -5415,18 +4860,12 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2q_lane_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
@@ -5448,18 +4887,12 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  return vld2q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2q_lane_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
@@ -5481,18 +4914,12 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  return vld2q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2q_lane_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
@@ -5514,18 +4941,12 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  return vld2q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2q_lane_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
@@ -5547,18 +4968,12 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  return vld2q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2q_lane_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
@@ -5580,18 +4995,12 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], i32 3, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], { <4 x float>, <4 x float> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  return vld2q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2q_lane_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
@@ -5613,18 +5022,12 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2_lane_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
@@ -5641,18 +5044,12 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2_lane_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
@@ -5674,18 +5071,12 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
  return vld2_lane_u16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2_lane_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
@@ -5707,18 +5098,12 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
  return vld2_lane_u32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld2_lane_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
@@ -5735,18 +5120,12 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2_lane_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
@@ -5768,18 +5147,12 @@ int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2_lane_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
@@ -5801,18 +5174,12 @@ int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld2_lane_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
@@ -5834,18 +5201,12 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld2_lane_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
@@ -5867,18 +5228,12 @@ float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], i32 1, i32 4)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], { <2 x float>, <2 x float> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float>
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld2_lane_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
@@ -5895,18 +5250,12 @@ float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld2_lane_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld2_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
@@ -5928,612 +5277,309 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
-// CHECK: [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
-// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP13:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}

-
-// CHECK-LABEL: define void @test_vld3q_u8(%struct.uint8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3q_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
  return vld3q_u8(a);
}

-// CHECK-LABEL: define void @test_vld3q_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3q_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
  return vld3q_u16(a);
}

-// CHECK-LABEL: define void @test_vld3q_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld3q_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
  return vld3q_u32(a);
}

-// CHECK-LABEL: define void @test_vld3q_s8(%struct.int8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3q_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
int8x16x3_t test_vld3q_s8(int8_t const * a) {
  return vld3q_s8(a);
}

-// CHECK-LABEL: define void @test_vld3q_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3q_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
int16x8x3_t test_vld3q_s16(int16_t const * a) {
  return vld3q_s16(a);
}

-// CHECK-LABEL: define void @test_vld3q_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld3q_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
int32x4x3_t test_vld3q_s32(int32_t const * a) {
  return vld3q_s32(a);
}

-// CHECK-LABEL: define void @test_vld3q_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld3q_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
float16x8x3_t test_vld3q_f16(float16_t const * a) {
  return vld3q_f16(a);
}

-// CHECK-LABEL: define void @test_vld3q_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld3q_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
float32x4x3_t test_vld3q_f32(float32_t const * a) {
  return vld3q_f32(a);
}

-// CHECK-LABEL: define void @test_vld3q_p8(%struct.poly8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3q_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
  return vld3q_p8(a);
}

-// CHECK-LABEL: define void @test_vld3q_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3q_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
  return vld3q_p16(a);
}

-// CHECK-LABEL: define void @test_vld3_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}

-// CHECK-LABEL: define void @test_vld3_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}

-// CHECK-LABEL: define void @test_vld3_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld3_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}

-// CHECK-LABEL: define void @test_vld3_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld3_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}

-// CHECK-LABEL: define void @test_vld3_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}

-// CHECK-LABEL: define void @test_vld3_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}

-// CHECK-LABEL: define void @test_vld3_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld3_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}

-// CHECK-LABEL: define void @test_vld3_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld3_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}

-// CHECK-LABEL: define void @test_vld3_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld3_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}

-// CHECK-LABEL: define void @test_vld3_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld3_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
float32x2x3_t test_vld3_f32(float32_t const * a) {
  return vld3_f32(a);
}

-// CHECK-LABEL: define void @test_vld3_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
  return vld3_p8(a);
}

-// CHECK-LABEL: define void @test_vld3_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
  return vld3_p16(a);
}

-
-// CHECK-LABEL: define void @test_vld3_dup_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld3_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
-// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
-// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
-// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
  return vld3_dup_u8(a);
}

-// CHECK-LABEL: define void @test_vld3_dup_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld3_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
-// CHECK: [[TMP9:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
-// CHECK: [[TMP10:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
  return vld3_dup_u16(a);
}

-// CHECK-LABEL: define void @test_vld3_dup_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld3_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) { return vld3_dup_u32(a); } -// CHECK-LABEL: define void @test_vld3_dup_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_u64( // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) { return vld3_dup_u64(a); } -// CHECK-LABEL: define void @test_vld3_dup_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_s8( // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) -// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0 -// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = insertvalue 
{ <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1 -// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer -// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2 -// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] -// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8* -// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> int8x8x3_t test_vld3_dup_s8(int8_t const * a) { return vld3_dup_s8(a); } -// CHECK-LABEL: define void @test_vld3_dup_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_s16( // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) -// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> int16x4x3_t test_vld3_dup_s16(int16_t const * a) { return vld3_dup_s16(a); } -// CHECK-LABEL: define void @test_vld3_dup_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_s32( // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) -// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x 
i32> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> int32x2x3_t test_vld3_dup_s32(int32_t const * a) { return vld3_dup_s32(a); } -// CHECK-LABEL: define void @test_vld3_dup_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_s64( // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> int64x1x3_t test_vld3_dup_s64(int64_t const * a) { return vld3_dup_s64(a); } -// CHECK-LABEL: define void @test_vld3_dup_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_f16( // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) -// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = 
insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> float16x4x3_t test_vld3_dup_f16(float16_t const * a) { return vld3_dup_f16(a); } -// CHECK-LABEL: define void @test_vld3_dup_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_f32( // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) -// CHECK: [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* -// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[TMP7]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> float32x2x3_t test_vld3_dup_f32(float32_t const * a) { return vld3_dup_f32(a); } -// CHECK-LABEL: define void @test_vld3_dup_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_p8( // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, 
<8 x i8> undef, i32 0, i32 1) -// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0 -// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1 -// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer -// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2 -// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8* -// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) { return vld3_dup_p8(a); } -// CHECK-LABEL: define void @test_vld3_dup_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld3_dup_p16( // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) -// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0 -// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1 -// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer -// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1 -// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2 -// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer -// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2 -// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]] -// CHECK: [[TMP9:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8* -// CHECK: [[TMP10:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) { return vld3_dup_p16(a); } - -// 
CHECK-LABEL: define void @test_vld3q_lane_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 @@ -6560,18 +5606,12 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); } -// CHECK-LABEL: define void @test_vld3q_lane_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 @@ -6598,18 +5638,12 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); } -// CHECK-LABEL: define void @test_vld3q_lane_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 @@ -6636,18 +5670,12 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> // CHECK: [[TMP12:%.*]] = bitcast 
<16 x i8> [[TMP8]] to <8 x i16> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); } -// CHECK-LABEL: define void @test_vld3q_lane_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 @@ -6674,18 +5702,12 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); } -// CHECK-LABEL: define void @test_vld3q_lane_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 @@ -6712,18 +5734,12 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast 
%struct.float16x8x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); } -// CHECK-LABEL: define void @test_vld3q_lane_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 @@ -6750,18 +5766,12 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], i32 3, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float> }* -// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { return vld3q_lane_f32(a, b, 3); } -// CHECK-LABEL: define void @test_vld3q_lane_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3q_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 @@ -6788,18 +5798,12 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { return vld3q_lane_p16(a, b, 7); } -// CHECK-LABEL: define void 
@test_vld3_lane_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 @@ -6819,18 +5823,12 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2 // CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8* -// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { return vld3_lane_u8(a, b, 7); } -// CHECK-LABEL: define void @test_vld3_lane_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 @@ -6857,18 +5855,12 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { return vld3_lane_u16(a, b, 3); } -// CHECK-LABEL: define void @test_vld3_lane_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 @@ -6895,18 +5887,12 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> // CHECK: [[TMP12:%.*]] = 
bitcast <8 x i8> [[TMP8]] to <2 x i32> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { return vld3_lane_u32(a, b, 1); } -// CHECK-LABEL: define void @test_vld3_lane_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 @@ -6926,18 +5912,12 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2 // CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] -// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8* -// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); } -// CHECK-LABEL: define void @test_vld3_lane_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 @@ -6964,18 +5944,12 @@ int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]] -// CHECK: 
[[TMP15:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); } -// CHECK-LABEL: define void @test_vld3_lane_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 @@ -7002,18 +5976,12 @@ int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }* -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); } -// CHECK-LABEL: define void @test_vld3_lane_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 @@ -7040,18 +6008,12 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); } -// CHECK-LABEL: define void @test_vld3_lane_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a, [3 x i64] %b.coerce) #0 
{ +// CHECK-LABEL: @test_vld3_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 @@ -7078,18 +6040,12 @@ float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], i32 1, i32 4) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float> }* -// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); } -// CHECK-LABEL: define void @test_vld3_lane_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 @@ -7109,18 +6065,12 @@ float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2 // CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }* -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8* -// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); } -// CHECK-LABEL: define void @test_vld3_lane_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vld3_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 @@ -7147,642 +6097,309 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> // 
CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2) -// CHECK: [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }* -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]] -// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8* -// CHECK: [[TMP16:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false) -// CHECK: ret void +// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); } - -// CHECK-LABEL: define void @test_vld4q_u8(%struct.uint8x16x4_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld4q_u8( // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> uint8x16x4_t test_vld4q_u8(uint8_t const * a) { return vld4q_u8(a); } -// CHECK-LABEL: define void @test_vld4q_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld4q_u16( // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> uint16x8x4_t test_vld4q_u16(uint16_t const * a) { return vld4q_u16(a); } -// CHECK-LABEL: define void @test_vld4q_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld4q_u32( // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: 
[[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> uint32x4x4_t test_vld4q_u32(uint32_t const * a) { return vld4q_u32(a); } -// CHECK-LABEL: define void @test_vld4q_s8(%struct.int8x16x4_t* noalias sret %agg.result, i8* %a) #0 { +// CHECK-LABEL: @test_vld4q_s8( // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1) -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] -// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %agg.result to i8* -// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> int8x16x4_t test_vld4q_s8(int8_t const * a) { return vld4q_s8(a); } -// CHECK-LABEL: define void @test_vld4q_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a) #0 { +// CHECK-LABEL: @test_vld4q_s16( // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8* -// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false) -// CHECK: ret void +// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> int16x8x4_t test_vld4q_s16(int16_t const * a) { return vld4q_s16(a); } -// CHECK-LABEL: define void @test_vld4q_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a) #0 { +// CHECK-LABEL: @test_vld4q_s32( // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4) -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]] -// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8* 
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
int32x4x4_t test_vld4q_s32(int32_t const * a) {
  return vld4q_s32(a);
}

-// CHECK-LABEL: define void @test_vld4q_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld4q_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
float16x8x4_t test_vld4q_f16(float16_t const * a) {
  return vld4q_f16(a);
}

-// CHECK-LABEL: define void @test_vld4q_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld4q_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
float32x4x4_t test_vld4q_f32(float32_t const * a) {
  return vld4q_f32(a);
}

-// CHECK-LABEL: define void @test_vld4q_p8(%struct.poly8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4q_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
  return vld4q_p8(a);
}

-// CHECK-LABEL: define void @test_vld4q_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4q_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
  return vld4q_p16(a);
}

-// CHECK-LABEL: define void @test_vld4_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
  return vld4_u8(a);
}

-// CHECK-LABEL: define void @test_vld4_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
  return vld4_u16(a);
}

-// CHECK-LABEL: define void @test_vld4_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
  return vld4_u32(a);
}

-// CHECK-LABEL: define void @test_vld4_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
  return vld4_u64(a);
}

-// CHECK-LABEL: define void @test_vld4_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
int8x8x4_t test_vld4_s8(int8_t const * a) {
  return vld4_s8(a);
}

-// CHECK-LABEL: define void @test_vld4_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
int16x4x4_t test_vld4_s16(int16_t const * a) {
  return vld4_s16(a);
}

-// CHECK-LABEL: define void @test_vld4_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
int32x2x4_t test_vld4_s32(int32_t const * a) {
  return vld4_s32(a);
}

-// CHECK-LABEL: define void @test_vld4_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
int64x1x4_t test_vld4_s64(int64_t const * a) {
  return vld4_s64(a);
}

-// CHECK-LABEL: define void @test_vld4_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld4_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
float16x4x4_t test_vld4_f16(float16_t const * a) {
  return vld4_f16(a);
}

-// CHECK-LABEL: define void @test_vld4_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld4_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
float32x2x4_t test_vld4_f32(float32_t const * a) {
  return vld4_f32(a);
}

-// CHECK-LABEL: define void @test_vld4_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
  return vld4_p8(a);
}

-// CHECK-LABEL: define void @test_vld4_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
  return vld4_p16(a);
}

-
-// CHECK-LABEL: define void @test_vld4_dup_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
-// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
-// CHECK: [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
-// CHECK: [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
-// CHECK: [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP11:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
  return vld4_dup_u8(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
  return vld4_dup_u16(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
  return vld4_dup_u32(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
  return vld4_dup_u64(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
-// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
-// CHECK: [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
-// CHECK: [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
-// CHECK: [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP11:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
  return vld4_dup_s8(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
  return vld4_dup_s16(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
  return vld4_dup_s32(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
  return vld4_dup_s64(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
  return vld4_dup_f16(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
-// CHECK: [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], <2 x float> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP9]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
  return vld4_dup_f32(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-// CHECK: [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
-// CHECK: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
-// CHECK: [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
-// CHECK: [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
-// CHECK: [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
-// CHECK: [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
-// CHECK: [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP11:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
  return vld4_dup_p8(a);
}

-// CHECK-LABEL: define void @test_vld4_dup_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK-LABEL: @test_vld4_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
-// CHECK: [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
-// CHECK: [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
-// CHECK: [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
-// CHECK: [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
-// CHECK: [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
-// CHECK: [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK: [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
-// CHECK: [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
-// CHECK: [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK: [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
-// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
  return vld4_dup_p16(a);
}

-
-// CHECK-LABEL: define void @test_vld4q_lane_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
@@ -7814,18 +6431,12 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
  return vld4q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4q_lane_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
@@ -7857,18 +6468,12 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
  return vld4q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4q_lane_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
@@ -7900,18 +6505,12 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
  return vld4q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4q_lane_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
@@ -7943,18 +6542,12 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
  return vld4q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4q_lane_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
@@ -7986,18 +6579,12 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
  return vld4q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4q_lane_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
@@ -8029,18 +6616,12 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x float> [[TMP16]], i32 3, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
  return vld4q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4q_lane_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
@@ -8072,18 +6653,12 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
  return vld4q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4_lane_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
@@ -8106,18 +6681,12 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
-// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
-// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
  return vld4_lane_u8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4_lane_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
@@ -8149,18 +6718,12 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
  return vld4_lane_u16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4_lane_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
@@ -8192,18 +6755,12 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
  return vld4_lane_u32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld4_lane_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
@@ -8226,18 +6783,12 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
-// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
-// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
  return vld4_lane_s8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4_lane_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
@@ -8269,18 +6820,12 @@ int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
  return vld4_lane_s16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4_lane_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
@@ -8312,18 +6857,12 @@ int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
  return vld4_lane_s32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld4_lane_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
@@ -8355,18 +6894,12 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
  return vld4_lane_f16(a, b, 3);
}

-// CHECK-LABEL: define void @test_vld4_lane_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
@@ -8398,18 +6931,12 @@ float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x float> [[TMP16]], i32 1, i32 4)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
  return vld4_lane_f32(a, b, 1);
}

-// CHECK-LABEL: define void @test_vld4_lane_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
@@ -8432,18 +6959,12 @@ float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
-// CHECK: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
-// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
-// CHECK: [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
  return vld4_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define void @test_vld4_lane_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vld4_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
@@ -8475,337 +6996,268 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
-// CHECK: [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
-// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
-// CHECK: [[TMP19:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
-// CHECK: ret void
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
  return vld4_lane_p16(a, b, 3);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmax_s8(
// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMAX_V_I]]
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
  return vmax_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmax_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
+// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VMAX_V2_I]]
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
  return vmax_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vmax_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
+// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VMAX_V2_I]]
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
  return vmax_s32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vmax_u8(
// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VMAX_V_I]]
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
  return vmax_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vmax_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]]
to <4 x i16> -// CHECK: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4 +// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VMAX_V2_I]] uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmax_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4 +// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VMAX_V2_I]] uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vmax_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) #4 +// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VMAX_V2_I]] float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmaxq_s8( // CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VMAXQ_V_I]] int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmaxq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4 +// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x 
i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VMAXQ_V2_I]] int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmaxq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4 +// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmaxq_u8( // CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VMAXQ_V_I]] uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmaxq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4 +// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VMAXQ_V2_I]] uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmaxq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4 +// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vmaxq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: 
[[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) #4 +// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VMAXQ_V2_I]] float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmin_s8( // CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VMIN_V_I]] int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmin_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4 +// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VMIN_V2_I]] int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmin_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4 +// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VMIN_V2_I]] int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmin_u8( // CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VMIN_V_I]] uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmin_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4 +// CHECK: 
[[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VMIN_V2_I]] uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmin_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4 +// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VMIN_V2_I]] uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vmin_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) #4 +// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VMIN_V2_I]] float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vminq_s8( // CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VMINQ_V_I]] int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vminq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4 +// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VMINQ_V2_I]] int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// 
CHECK-LABEL: @test_vminq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4 +// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VMINQ_V2_I]] int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vminq_u8( // CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VMINQ_V_I]] uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vminq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4 +// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VMINQ_V2_I]] uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vminq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4 +// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VMINQ_V2_I]] uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vminq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) #4 +// CHECK: [[VMINQ_V2_I:%.*]] = 
call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VMINQ_V2_I]] float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmla_s8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -8813,7 +7265,7 @@ int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmla_s16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] // CHECK: ret <4 x i16> [[ADD_I]] @@ -8821,7 +7273,7 @@ int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmla_s32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] // CHECK: ret <2 x i32> [[ADD_I]] @@ -8829,7 +7281,7 @@ int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmla_f32( // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c // CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] // CHECK: ret <2 x float> [[ADD_I]] @@ -8837,7 +7289,7 @@ float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); } -// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmla_u8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c // CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] // CHECK: ret <8 x i8> [[ADD_I]] @@ -8845,7 +7297,7 @@ uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmla_u16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] // CHECK: ret <4 x i16> [[ADD_I]] @@ -8853,7 +7305,7 @@ uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmla_u32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] // CHECK: ret <2 x i32> [[ADD_I]] @@ -8861,7 +7313,7 @@ uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlaq_s8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c // CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -8869,7 +7321,7 @@ int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: 
@test_vmlaq_s16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -8877,7 +7329,7 @@ int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlaq_s32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] // CHECK: ret <4 x i32> [[ADD_I]] @@ -8885,7 +7337,7 @@ int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK-LABEL: @test_vmlaq_f32( // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c // CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] // CHECK: ret <4 x float> [[ADD_I]] @@ -8893,7 +7345,7 @@ float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlaq_u8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c // CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] // CHECK: ret <16 x i8> [[ADD_I]] @@ -8901,7 +7353,7 @@ uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlaq_u16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -8909,7 +7361,7 @@ uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlaq_u32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] // CHECK: ret <4 x i32> [[ADD_I]] @@ -8917,8 +7369,7 @@ uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); } - -// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlal_s8( // CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -8926,31 +7377,27 @@ int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlal_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> 
%c) #0 { +// CHECK-LABEL: @test_vmlal_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlal_u8( // CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] // CHECK: ret <8 x i16> [[ADD_I]] @@ -8958,146 +7405,123 @@ uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlal_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlal_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } - -// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlal_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4 // CHECK: 
[[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlal_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlal_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlal_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); } - -// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmlal_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast 
<4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmlal_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlal_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlal_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x 
i32> %b, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); } - -// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmla_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -9106,7 +7530,7 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmla_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -9115,7 +7539,7 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmla_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] @@ -9124,7 +7548,7 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmla_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] @@ -9133,7 +7557,7 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } -// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmla_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] @@ -9142,7 +7566,7 @@ float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlaq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -9151,7 +7575,7 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlaq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -9160,7 +7584,7 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t 
a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlaq_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] @@ -9169,7 +7593,7 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlaq_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] @@ -9178,7 +7602,7 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmlaq_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] // CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] @@ -9187,8 +7611,7 @@ float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); } - -// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmla_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9200,7 +7623,7 @@ int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmla_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -9210,7 +7633,7 @@ int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmla_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9222,7 +7645,7 @@ uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmla_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -9232,7 +7655,7 @@ uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmla_n_u32(a, b, c); } -// CHECK-LABEL: define <2 x float> 
@test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK-LABEL: @test_vmla_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] @@ -9242,7 +7665,7 @@ float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmla_n_f32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9258,7 +7681,7 @@ int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -9270,7 +7693,7 @@ int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9286,7 +7709,7 @@ uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -9298,7 +7721,7 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK-LABEL: @test_vmlaq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 @@ -9310,8 +7733,7 @@ float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); } - -// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmls_s8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] // CHECK: ret <8 x i8> [[SUB_I]] @@ -9319,7 +7741,7 @@ int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmls_s8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmls_s16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, 
[[MUL_I]] // CHECK: ret <4 x i16> [[SUB_I]] @@ -9327,7 +7749,7 @@ int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmls_s32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] // CHECK: ret <2 x i32> [[SUB_I]] @@ -9335,7 +7757,7 @@ int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_s32(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmls_f32( // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c // CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] // CHECK: ret <2 x float> [[SUB_I]] @@ -9343,7 +7765,7 @@ float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_f32(a, b, c); } -// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmls_u8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] // CHECK: ret <8 x i8> [[SUB_I]] @@ -9351,7 +7773,7 @@ uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmls_u8(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmls_u16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] // CHECK: ret <4 x i16> [[SUB_I]] @@ -9359,7 +7781,7 @@ uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmls_u32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] // CHECK: ret <2 x i32> [[SUB_I]] @@ -9367,7 +7789,7 @@ uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_u32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlsq_s8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] // CHECK: ret <16 x i8> [[SUB_I]] @@ -9375,7 +7797,7 @@ int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlsq_s8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsq_s16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -9383,7 +7805,7 @@ int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlsq_s16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsq_s32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] // CHECK: ret <4 x i32> [[SUB_I]] @@ -9391,7 +7813,7 @@ int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlsq_s32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK-LABEL: @test_vmlsq_f32( // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c // CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] // CHECK: ret <4 x float> [[SUB_I]] @@ -9399,7 +7821,7 @@ float32x4_t 
test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlsq_f32(a, b, c); } -// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlsq_u8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] // CHECK: ret <16 x i8> [[SUB_I]] @@ -9407,7 +7829,7 @@ uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlsq_u8(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsq_u16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -9415,7 +7837,7 @@ uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlsq_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsq_u32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] // CHECK: ret <4 x i32> [[SUB_I]] @@ -9423,8 +7845,7 @@ uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlsq_u32(a, b, c); } - -// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlsl_s8( // CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -9432,31 +7853,27 @@ int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK-LABEL: @test_vmlsl_u8( // CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4 // CHECK: 
[[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -9464,146 +7881,123 @@ uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) #4 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) #4 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, c); } - -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsl_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, 
[[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsl_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); } - -// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 
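// NOTE: an illustrative reference, not part of the checked-in test (the
// helper name below is invented for this sketch): the vmlsl_n_* intrinsics
// exercised here splat the scalar operand, do a widening multiply against
// the splat, and subtract the wide product from the accumulator. The
// rewritten CHECK lines stop matching the redundant bitcast round-trips, so
// the vmull intrinsic call is checked against %b and the splatted vector
// directly.
#include <stdint.h>
static inline void vmlsl_n_s32_ref(int64_t acc[2], const int32_t b[2],
                                   int32_t c) {
  for (int i = 0; i < 2; ++i)
    acc[i] -= (int64_t)b[i] * (int64_t)c; // widen, multiply, then subtract
}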
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsl_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); } - -// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmls_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -9612,7 +8006,7 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmls_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: 
[[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -9621,7 +8015,7 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmls_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] @@ -9630,7 +8024,7 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmls_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] @@ -9639,7 +8033,7 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } -// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmls_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] @@ -9648,7 +8042,7 @@ float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -9657,7 +8051,7 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -9666,7 +8060,7 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK-LABEL: @test_vmlsq_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] @@ -9675,7 +8069,7 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlsq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK-LABEL: @test_vmlsq_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] @@ -9684,7 +8078,7 
@@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK-LABEL: @test_vmlsq_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] // CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] @@ -9693,8 +8087,7 @@ float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); } - -// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmls_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9706,7 +8099,7 @@ int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmls_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -9716,7 +8109,7 @@ int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); } -// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmls_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9728,7 +8121,7 @@ uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); } -// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmls_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] @@ -9738,7 +8131,7 @@ uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); } -// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK-LABEL: @test_vmls_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] @@ -9748,7 +8141,7 @@ float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmls_n_f32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9764,7 +8157,7 @@ int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlsq_n_s16(a, b, c); } -// 
CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -9776,7 +8169,7 @@ int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); } -// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 @@ -9792,7 +8185,7 @@ uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); } -// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 @@ -9804,7 +8197,7 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); } -// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK-LABEL: @test_vmlsq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 @@ -9816,114 +8209,101 @@ float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlsq_n_f32(a, b, c); } - -// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_s8( // CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I]] int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I]] int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I]] int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmovl_u8( // CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[VMOVL_I]] uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovl_u16( // CHECK: [[TMP0:%.*]] = 
bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[VMOVL_I]] uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64> // CHECK: ret <2 x i64> [[VMOVL_I]] uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } - -// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[VMOVN_I]] int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[VMOVN_I]] int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vmovn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[VMOVN_I]] int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmovn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[VMOVN_I]] uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmovn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[VMOVN_I]] uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vmovn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[VMOVN_I]] uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } - -// CHECK-LABEL: define <8 x i8> 
@test_vmov_n_u8(i8 zeroext %a) #0 { +// CHECK-LABEL: @test_vmov_n_u8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -9937,7 +8317,7 @@ uint8x8_t test_vmov_n_u8(uint8_t a) { return vmov_n_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 zeroext %a) #0 { +// CHECK-LABEL: @test_vmov_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -9947,7 +8327,7 @@ uint16x4_t test_vmov_n_u16(uint16_t a) { return vmov_n_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vmov_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: ret <2 x i32> [[VECINIT1_I]] @@ -9955,7 +8335,7 @@ uint32x2_t test_vmov_n_u32(uint32_t a) { return vmov_n_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vmov_n_s8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -9969,7 +8349,7 @@ int8x8_t test_vmov_n_s8(int8_t a) { return vmov_n_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vmov_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -9979,7 +8359,7 @@ int16x4_t test_vmov_n_s16(int16_t a) { return vmov_n_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vmov_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: ret <2 x i32> [[VECINIT1_I]] @@ -9987,7 +8367,7 @@ int32x2_t test_vmov_n_s32(int32_t a) { return vmov_n_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vmov_n_p8( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -10001,7 +8381,7 @@ poly8x8_t test_vmov_n_p8(poly8_t a) { return vmov_n_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vmov_n_p16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -10011,7 +8391,7 @@ poly16x4_t test_vmov_n_p16(poly16_t a) { return vmov_n_p16(a); } -// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 { +// CHECK-LABEL: @test_vmov_n_f16( // CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 // CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 
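// NOTE: a minimal reference sketch (helper name invented here, not from the
// test file) for the vmov_n_*/vmovq_n_* tests in this block: each intrinsic
// broadcasts one scalar into every lane, which the IR models as a chain of
// insertelement instructions, one per lane, matched in order by the
// [[VECINIT*]] CHECK lines.
static inline void vmov_n_f32_ref(float out[2], float a) {
  for (int i = 0; i < 2; ++i)
    out[i] = a; // one insertelement per lane
}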
// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 @@ -10022,7 +8402,7 @@ float16x4_t test_vmov_n_f16(float16_t *a) { return vmov_n_f16(*a); } -// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 { +// CHECK-LABEL: @test_vmov_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1 // CHECK: ret <2 x float> [[VECINIT1_I]] @@ -10030,7 +8410,7 @@ float32x2_t test_vmov_n_f32(float32_t a) { return vmov_n_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 zeroext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_u8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -10052,7 +8432,7 @@ uint8x16_t test_vmovq_n_u8(uint8_t a) { return vmovq_n_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 zeroext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -10066,7 +8446,7 @@ uint16x8_t test_vmovq_n_u16(uint16_t a) { return vmovq_n_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 { +// CHECK-LABEL: @test_vmovq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 @@ -10076,7 +8456,7 @@ uint32x4_t test_vmovq_n_u32(uint32_t a) { return vmovq_n_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_s8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ -10098,7 +8478,7 @@ int8x16_t test_vmovq_n_s8(int8_t a) { return vmovq_n_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -10112,7 +8492,7 @@ int16x8_t test_vmovq_n_s16(int16_t a) { return vmovq_n_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 { +// CHECK-LABEL: @test_vmovq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 @@ -10122,7 +8502,7 @@ int32x4_t test_vmovq_n_s32(int32_t a) { return vmovq_n_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 signext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_p8( // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 @@ 
-10144,7 +8524,7 @@ poly8x16_t test_vmovq_n_p8(poly8_t a) { return vmovq_n_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 signext %a) #0 { +// CHECK-LABEL: @test_vmovq_n_p16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 @@ -10158,7 +8538,7 @@ poly16x8_t test_vmovq_n_p16(poly16_t a) { return vmovq_n_p16(a); } -// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 { +// CHECK-LABEL: @test_vmovq_n_f16( // CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 // CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 // CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 @@ -10173,7 +8553,7 @@ float16x8_t test_vmovq_n_f16(float16_t *a) { return vmovq_n_f16(*a); } -// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 { +// CHECK-LABEL: @test_vmovq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2 @@ -10183,7 +8563,7 @@ float32x4_t test_vmovq_n_f32(float32_t a) { return vmovq_n_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vmov_n_s64( // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] // CHECK: ret <1 x i64> [[ADD_I]] @@ -10192,7 +8572,7 @@ int64x1_t test_vmov_n_s64(int64_t a) { return vadd_s64(tmp, tmp); } -// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vmov_n_u64( // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] // CHECK: ret <1 x i64> [[ADD_I]] @@ -10201,7 +8581,7 @@ uint64x1_t test_vmov_n_u64(uint64_t a) { return vadd_u64(tmp, tmp); } -// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 { +// CHECK-LABEL: @test_vmovq_n_s64( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 // CHECK: ret <2 x i64> [[VECINIT1_I]] @@ -10209,7 +8589,7 @@ int64x2_t test_vmovq_n_s64(int64_t a) { return vmovq_n_s64(a); } -// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 { +// CHECK-LABEL: @test_vmovq_n_u64( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 // CHECK: ret <2 x i64> [[VECINIT1_I]] @@ -10217,294 +8597,264 @@ uint64x2_t test_vmovq_n_u64(uint64_t a) { return vmovq_n_u64(a); } - -// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmul_s8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b // CHECK: ret <8 x i8> [[MUL_I]] int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) { return vmul_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmul_s16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b // CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) { return vmul_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: 
@test_vmul_s32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b // CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) { return vmul_s32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vmul_f32( // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b // CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) { return vmul_f32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmul_u8( // CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b // CHECK: ret <8 x i8> [[MUL_I]] uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) { return vmul_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmul_u16( // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b // CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) { return vmul_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmul_u32( // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b // CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) { return vmul_u32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmulq_s8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b // CHECK: ret <16 x i8> [[MUL_I]] int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) { return vmulq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmulq_s16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b // CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) { return vmulq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmulq_s32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b // CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) { return vmulq_s32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vmulq_f32( // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b // CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) { return vmulq_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmulq_u8( // CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b // CHECK: ret <16 x i8> [[MUL_I]] uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) { return vmulq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vmulq_u16( // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b // CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) { return vmulq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vmulq_u32( // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b // CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) { return vmulq_u32(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmull_s8( // CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i16> [[VMULL_I]] int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return 
vmull_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmull_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmull_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmull_u8( // CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i16> [[VMULL_I]] uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmull_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmull_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmull_p8( // CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); } - -// CHECK-LABEL: 
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmull_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmull_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } -// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmull_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } -// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmull_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); } - -// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 signext %b) #0 { +// CHECK-LABEL: @test_vmull_n_s16( // CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4 +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 zeroext %b) #0 { +// CHECK-LABEL: @test_vmull_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4 // CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmull_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x 
i32> [[VMULL2_I]]) #4 +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4 // CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vmul_p8( // CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VMUL_V_I]] poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) { return vmul_p8(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vmulq_p8( // CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VMULQ_V_I]] poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { return vmulq_p8(a, b); } - -// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmul_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -10512,7 +8862,7 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmul_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -10520,7 +8870,7 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); } -// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vmul_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] // CHECK: ret <2 x float> [[MUL]] @@ -10528,7 +8878,7 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); } -// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmul_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] // CHECK: ret <4 x i16> [[MUL]] @@ -10536,7 +8886,7 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmul_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] // CHECK: ret <2 x i32> [[MUL]] @@ -10544,7 +8894,7 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmulq_lane_s16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -10552,7 +8902,7 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); } -// CHECK-LABEL: 
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmulq_lane_s32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -10560,7 +8910,7 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); } -// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vmulq_lane_f32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] // CHECK: ret <4 x float> [[MUL]] @@ -10568,7 +8918,7 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vmulq_lane_u16( // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] // CHECK: ret <8 x i16> [[MUL]] @@ -10576,7 +8926,7 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vmulq_lane_u32( // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] // CHECK: ret <4 x i32> [[MUL]] @@ -10584,8 +8934,7 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); } - -// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 signext %b) #0 { +// CHECK-LABEL: @test_vmul_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -10596,7 +8945,7 @@ int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmul_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] @@ -10605,7 +8954,7 @@ int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK-LABEL: @test_vmul_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] @@ -10614,7 +8963,7 @@ float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 zeroext %b) #0 { +// CHECK-LABEL: @test_vmul_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -10625,7 +8974,7 @@ uint16x4_t 
test_vmul_n_u16(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmul_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] @@ -10634,7 +8983,7 @@ uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) #0 { +// CHECK-LABEL: @test_vmulq_n_s16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -10649,7 +8998,7 @@ int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { return vmulq_n_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_s32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 @@ -10660,7 +9009,7 @@ int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { return vmulq_n_s32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK-LABEL: @test_vmulq_n_f32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 @@ -10671,7 +9020,7 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { return vmulq_n_f32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 zeroext %b) #0 { +// CHECK-LABEL: @test_vmulq_n_u16( // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 @@ -10686,7 +9035,7 @@ uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { return vmulq_n_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK-LABEL: @test_vmulq_n_u32( // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 @@ -10697,164 +9046,161 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { return vmulq_n_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_s8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvn_s16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <4 x i16> [[NEG_I]] int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvn_s32( // CHECK: 
[[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> // CHECK: ret <2 x i32> [[NEG_I]] int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_u8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvn_u16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <4 x i16> [[NEG_I]] uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvn_u32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> // CHECK: ret <2 x i32> [[NEG_I]] uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvn_p8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <8 x i8> [[NEG_I]] poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <8 x i16> [[NEG_I]] int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvnq_s32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: ret <4 x i32> [[NEG_I]] int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: ret <8 x i16> [[NEG_I]] uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vmvnq_u32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: ret <4 x i32> [[NEG_I]] uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vmvnq_p8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: ret <16 x i8> [[NEG_I]] poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); } - -// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> 
%a) #0 { +// CHECK-LABEL: @test_vneg_s8( // CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a // CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vneg_s16( // CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a // CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vneg_s32( // CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a // CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); } -// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vneg_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a // CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vnegq_s8( // CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a // CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vnegq_s16( // CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a // CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vnegq_s32( // CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); } -// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vnegq_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a // CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); } - -// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vorn_s8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] // CHECK: ret <8 x i8> [[OR_I]] @@ -10862,7 +9208,7 @@ int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { return vorn_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vorn_s16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] // CHECK: ret <4 x i16> [[OR_I]] @@ -10870,7 +9216,7 @@ int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { return vorn_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vorn_s32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> // CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] // CHECK: ret <2 x i32> [[OR_I]] @@ -10878,7 +9224,7 @@ int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { return vorn_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vorn_s64( // CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> // CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] // CHECK: ret <1 x i64> [[OR_I]] @@ -10886,7 +9232,7 @@ int64x1_t test_vorn_s64(int64x1_t 
a, int64x1_t b) { return vorn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vorn_u8( // CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] // CHECK: ret <8 x i8> [[OR_I]] @@ -10894,7 +9240,7 @@ uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { return vorn_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vorn_u16( // CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] // CHECK: ret <4 x i16> [[OR_I]] @@ -10902,7 +9248,7 @@ uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { return vorn_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vorn_u32( // CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> // CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] // CHECK: ret <2 x i32> [[OR_I]] @@ -10910,7 +9256,7 @@ uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { return vorn_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vorn_u64( // CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> // CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] // CHECK: ret <1 x i64> [[OR_I]] @@ -10918,7 +9264,7 @@ uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { return vorn_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vornq_s8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] // CHECK: ret <16 x i8> [[OR_I]] @@ -10926,7 +9272,7 @@ int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { return vornq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vornq_s16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] // CHECK: ret <8 x i16> [[OR_I]] @@ -10934,7 +9280,7 @@ int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { return vornq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vornq_s32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] // CHECK: ret <4 x i32> [[OR_I]] @@ -10942,7 +9288,7 @@ int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { return vornq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vornq_s64( // CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> // CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] // CHECK: ret <2 x i64> [[OR_I]] @@ -10950,7 +9296,7 @@ int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { return vornq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vornq_u8( // CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> // CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] // CHECK: ret <16 x i8> [[OR_I]] @@ -10958,7 +9304,7 @@ uint8x16_t 
test_vornq_u8(uint8x16_t a, uint8x16_t b) { return vornq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vornq_u16( // CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> // CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] // CHECK: ret <8 x i16> [[OR_I]] @@ -10966,7 +9312,7 @@ uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { return vornq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vornq_u32( // CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> // CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] // CHECK: ret <4 x i32> [[OR_I]] @@ -10974,7 +9320,7 @@ uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { return vornq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vornq_u64( // CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> // CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] // CHECK: ret <2 x i64> [[OR_I]] @@ -10982,891 +9328,751 @@ uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) { return vornq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vorr_s8( // CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b // CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { return vorr_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vorr_s16( // CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b // CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { return vorr_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vorr_s32( // CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b // CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { return vorr_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vorr_s64( // CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b // CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { return vorr_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vorr_u8( // CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b // CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { return vorr_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vorr_u16( // CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b // CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { return vorr_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vorr_u32( // CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b // CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { return vorr_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vorr_u64( // CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b // CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { return vorr_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vorrq_s8( // CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b // 
CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { return vorrq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vorrq_s16( // CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b // CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { return vorrq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vorrq_s32( // CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b // CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { return vorrq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vorrq_s64( // CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b // CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { return vorrq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vorrq_u8( // CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b // CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { return vorrq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vorrq_u16( // CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b // CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { return vorrq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vorrq_u32( // CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b // CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { return vorrq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vorrq_u64( // CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b // CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { return vorrq_u64(a, b); } - -// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpadal_s8( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4 +// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 // CHECK: ret <4 x i16> [[VPADAL_V1_I]] int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadal_s16( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4 +// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 // CHECK: ret <2 x i32> [[VPADAL_V2_I]] int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadal_s32( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to 
<8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4 +// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 // CHECK: ret <1 x i64> [[VPADAL_V2_I]] int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpadal_u8( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4 +// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 // CHECK: ret <4 x i16> [[VPADAL_V1_I]] uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadal_u16( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4 +// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 // CHECK: ret <2 x i32> [[VPADAL_V2_I]] uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadal_u32( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4 +// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 // CHECK: ret <1 x i64> [[VPADAL_V2_I]] uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vpadalq_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4 +// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 // CHECK: ret <8 x i16> [[VPADALQ_V1_I]] int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadalq_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// 
CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4 +// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 // CHECK: ret <4 x i32> [[VPADALQ_V2_I]] int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadalq_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4 +// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 // CHECK: ret <2 x i64> [[VPADALQ_V2_I]] int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vpadalq_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4 +// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 // CHECK: ret <8 x i16> [[VPADALQ_V1_I]] uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadalq_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4 +// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 // CHECK: ret <4 x i32> [[VPADALQ_V2_I]] uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadalq_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4 +// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 // CHECK: ret <2 x i64> [[VPADALQ_V2_I]] uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: 
@test_vpadd_s8( // CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPADD_V_I]] int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPADD_V2_I]] int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPADD_V2_I]] int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpadd_u8( // CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPADD_V_I]] uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpadd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPADD_V2_I]] uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpadd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x 
i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPADD_V2_I]] uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vpadd_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VPADD_V2_I]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } - -// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vpaddl_s8( // CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4 // CHECK: ret <4 x i16> [[VPADDL_I]] int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); } -// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vpaddl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4 // CHECK: ret <2 x i32> [[VPADDL1_I]] int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); } -// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vpaddl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4 // CHECK: ret <1 x i64> [[VPADDL1_I]] int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vpaddl_u8( // CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4 // CHECK: ret <4 x i16> [[VPADDL_I]] uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); } -// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vpaddl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> 
@llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4 // CHECK: ret <2 x i32> [[VPADDL1_I]] uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); } -// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vpaddl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4 // CHECK: ret <1 x i64> [[VPADDL1_I]] uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_s8( // CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4 // CHECK: ret <8 x i16> [[VPADDL_I]] int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4 // CHECK: ret <4 x i32> [[VPADDL1_I]] int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4 // CHECK: ret <2 x i64> [[VPADDL1_I]] int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_u8( // CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4 // CHECK: ret <8 x i16> [[VPADDL_I]] uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4 // CHECK: ret <4 x i32> [[VPADDL1_I]] uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vpaddlq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4 // CHECK: ret <2 x i64> [[VPADDL1_I]] uint64x2_t 
test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } - -// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpmax_s8( // CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPMAX_V_I]] int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpmax_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4 +// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPMAX_V2_I]] int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpmax_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4 +// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPMAX_V2_I]] int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpmax_u8( // CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPMAX_V_I]] uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpmax_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4 +// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPMAX_V2_I]] uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: 
@test_vpmax_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4 +// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPMAX_V2_I]] uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vpmax_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) #4 +// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VPMAX_V2_I]] float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpmin_s8( // CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPMIN_V_I]] int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpmin_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4 +// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPMIN_V2_I]] int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpmin_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4 +// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> 
@llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPMIN_V2_I]] int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vpmin_u8( // CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VPMIN_V_I]] uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vpmin_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4 +// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VPMIN_V2_I]] uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vpmin_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4 +// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VPMIN_V2_I]] uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vpmin_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) #4 +// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VPMIN_V2_I]] float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: 
@test_vqabs_s8( // CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4 // CHECK: ret <8 x i8> [[VQABS_V_I]] int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vqabs_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4 +// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4 // CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK: ret <4 x i16> [[VQABS_V1_I]] int16x4_t test_vqabs_s16(int16x4_t a) { return vqabs_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vqabs_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4 +// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4 // CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK: ret <2 x i32> [[VQABS_V1_I]] int32x2_t test_vqabs_s32(int32x2_t a) { return vqabs_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vqabsq_s8( // CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4 // CHECK: ret <16 x i8> [[VQABSQ_V_I]] int8x16_t test_vqabsq_s8(int8x16_t a) { return vqabsq_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vqabsq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4 +// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4 // CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK: ret <8 x i16> [[VQABSQ_V1_I]] int16x8_t test_vqabsq_s16(int16x8_t a) { return vqabsq_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vqabsq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4 +// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4 // CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK: ret <4 x i32> [[VQABSQ_V1_I]] int32x4_t test_vqabsq_s32(int32x4_t a) { return vqabsq_s32(a); } - -// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vqadd_s8( // CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, 
<8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VQADD_V_I]] int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vqadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQADD_V2_I]] int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { return vqadd_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vqadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQADD_V2_I]] int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { return vqadd_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vqadd_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VQADD_V2_I]] int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { return vqadd_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vqadd_u8( // CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VQADD_V_I]] uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vqadd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 
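(Annotation, not part of the patch.) The vqadd hunks here and below are representative of the whole change: the old checks pinned a bitcast round-trip, e.g. <4 x i16> -> <8 x i8> -> <4 x i16>, that clang no longer emits before calling the llvm.arm.neon.vqadds/vqaddu intrinsics, so the regenerated lines feed %a and %b into the call directly. What the tests assert semantically is untouched. Below is a standalone sketch of the saturating behaviour these intrinsics implement, assuming a target where <arm_neon.h> is available; the program is illustrative and not part of this diff:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  int16x4_t a = vdup_n_s16(32767);   /* INT16_MAX in every lane */
  int16x4_t b = vdup_n_s16(1);
  int16x4_t wrap = vadd_s16(a, b);   /* plain add: lanes wrap to -32768 */
  int16x4_t sat  = vqadd_s16(a, b);  /* saturating add: lanes stay at 32767 */
  printf("wrap=%d sat=%d\n", (int)vget_lane_s16(wrap, 0), (int)vget_lane_s16(sat, 0));
  return 0;
}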
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VQADD_V2_I]] uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { return vqadd_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vqadd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VQADD_V2_I]] uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { return vqadd_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vqadd_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4 +// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VQADD_V2_I]] uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { return vqadd_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqaddq_s8( // CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQADDQ_V_I]] int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { return vqaddq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqaddq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4 +// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] 
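(Annotation, not part of the patch.) The other mechanical rewrite in every hunk is the label itself: "// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {" becomes "// CHECK-LABEL: @test_vqaddq_s16(". The shorter form is deliberately insensitive to attribute-group renumbering (the trailing #0) and to changes in how signatures are printed, while FileCheck's CHECK-LABEL still partitions the module dump per function, since each label pattern must match exactly once. A minimal sketch of the idiom follows; the RUN line is a plausible approximation (the real one at the top of this file may differ), and plus_sat is an invented name:

// RUN: %clang_cc1 -triple armv7-none-eabi -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>

// CHECK-LABEL: @plus_sat(
// CHECK: call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: ret <4 x i16>
int16x4_t plus_sat(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}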
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { return vqaddq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { return vqaddq_s32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { return vqaddq_s64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u8(
// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { return vqaddq_u8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { return vqaddq_u16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) { return vqaddq_u32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqaddq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) { return vqaddq_u64(a, b); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
@@ -11874,94 +10080,73 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlal_n_s16(a, b, c); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
-// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlal_n_s32(a, b, c); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_s32(a, b, c); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_lane_s16(a, b, c, 3); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_lane_s32(a, b, c, 1); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
@@ -11969,176 +10154,137 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlsl_n_s16(a, b, c); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
-// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlsl_n_s32(a, b, c); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) { return vqdmulh_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) { return vqdmulh_s32(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) { return vqdmulhq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { return vqdmulhq_s32(a, b); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { return vqdmulh_lane_s16(a, b, 3); }
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { return vqdmulh_lane_s32(a, b, 1); }
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { return vqdmulhq_lane_s16(a, b, 3); }
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #4
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqdmulhq_lane_s32(a, b, 1); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
+// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
@@ -12149,420 +10295,334 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
+// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #4
// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { return vqdmulhq_n_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
+// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #4
// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { return vqdmulhq_n_s32(a, b); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { return vqdmull_s16(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { return vqdmull_s32(a, b); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { return vqdmull_lane_s16(a, b, 3); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) { return vqdmull_lane_s32(a, b, 1); }
-
-// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
+// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { return vqdmull_n_s16(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { return vqdmull_n_s32(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
int8x8_t test_vqmovn_s16(int16x8_t a) { return vqmovn_s16(a); }
-// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
int16x4_t test_vqmovn_s32(int32x4_t a) { return vqmovn_s32(a); }
-// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
int32x2_t test_vqmovn_s64(int64x2_t a) { return vqmovn_s64(a); }
-// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
uint8x8_t test_vqmovn_u16(uint16x8_t a) { return vqmovn_u16(a); }
-// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
uint16x4_t test_vqmovn_u32(uint32x4_t a) { return vqmovn_u32(a); }
-// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqmovn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
uint32x2_t test_vqmovn_u64(uint64x2_t a) { return vqmovn_u64(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqmovun_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
uint8x8_t test_vqmovun_s16(int16x8_t a) { return vqmovun_s16(a); }
-// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqmovun_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
uint16x4_t test_vqmovun_s32(int32x4_t a) { return vqmovun_s32(a); }
-// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqmovun_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
uint32x2_t test_vqmovun_s64(int64x2_t a) { return vqmovun_s64(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s8(
// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
// CHECK: ret <8 x i8> [[VQNEG_V_I]]
int8x8_t test_vqneg_s8(int8x8_t a) { return vqneg_s8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
int16x4_t test_vqneg_s16(int16x4_t a) { return vqneg_s16(a); }
-// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqneg_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
int32x2_t test_vqneg_s32(int32x2_t a) { return vqneg_s32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s8(
// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
int8x16_t test_vqnegq_s8(int8x16_t a) { return vqnegq_s8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
int16x8_t test_vqnegq_s16(int16x8_t a) { return vqnegq_s16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqnegq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
int32x4_t test_vqnegq_s32(int32x4_t a) { return vqnegq_s32(a); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_s32(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_lane_s16(a, b, 3); }
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_lane_s32(a, b, 1); }
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { return vqrdmulhq_lane_s16(a, b, 3); }
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #4
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqrdmulhq_lane_s32(a, b, 1); }
-
-// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
+// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
+// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
@@ -12573,220 +10633,176 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
+// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #4
// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
+// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #4
// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return vqrshl_s8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqrshlq_s8(
// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqrshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
int16x8_t test_vqrshlq_s16(int16x8_t a,
int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u8( // CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vqrshlq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 
-// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqrshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -12795,7 +10811,7 @@ int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
return vqrshrn_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -12804,7 +10820,7 @@ int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
return vqrshrn_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -12813,7 +10829,7 @@ int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
return vqrshrn_n_s64(a, 1);
}
-// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -12822,7 +10838,7 @@ uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
return vqrshrn_n_u16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -12831,7 +10847,7 @@ uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
return vqrshrn_n_u32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqrshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -12840,8 +10856,7 @@ uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
return vqrshrn_n_u64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqrshrun_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -12850,7 +10865,7 @@ uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
return vqrshrun_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqrshrun_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -12859,7 +10874,7 @@ uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
return vqrshrun_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqrshrun_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -12868,200 +10883,162 @@ uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
return vqrshrun_n_s64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { return vqshlq_s64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHLU_N]]
uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 1); }
-// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
@@ -13070,7 +11047,7 @@ uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
return vqshlu_n_s16(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
@@ -13079,7 +11056,7 @@ uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
return vqshlu_n_s32(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshlu_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
@@ -13088,14 +11065,14 @@ uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
return vqshlu_n_s64(a, 1);
}
-// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHLU_N]]
uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 1); }
-// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -13104,7 +11081,7 @@ uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
return vqshluq_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -13113,7 +11090,7 @@ uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
return vqshluq_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshluq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
@@ -13122,15 +11099,14 @@ uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
return vqshluq_n_s64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_s8(
// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 1); }
-// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
@@ -13139,7 +11115,7 @@ int16x4_t test_vqshl_n_s16(int16x4_t a) {
return vqshl_n_s16(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
@@ -13148,7 +11124,7 @@ int32x2_t test_vqshl_n_s32(int32x2_t a) {
return vqshl_n_s32(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
@@ -13157,14 +11133,14 @@ int64x1_t test_vqshl_n_s64(int64x1_t a) {
return vqshl_n_s64(a, 1);
}
-// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_u8(
// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 1); }
-// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
@@ -13173,7 +11149,7 @@ uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
return vqshl_n_u16(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
@@ -13182,7 +11158,7 @@ uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
return vqshl_n_u32(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshl_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
@@ -13191,14 +11167,14 @@ uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
return vqshl_n_u64(a, 1);
}
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_s8(
// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 1); }
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -13207,7 +11183,7 @@ int16x8_t test_vqshlq_n_s16(int16x8_t a) {
return vqshlq_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -13216,7 +11192,7 @@ int32x4_t test_vqshlq_n_s32(int32x4_t a) {
return vqshlq_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
@@ -13225,14 +11201,14 @@ int64x2_t test_vqshlq_n_s64(int64x2_t a) {
return vqshlq_n_s64(a, 1);
}
-// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u8(
// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 1); }
-// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -13241,7 +11217,7 @@ uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
return vqshlq_n_u16(a, 1);
}
-// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -13250,7 +11226,7 @@ uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
return vqshlq_n_u32(a, 1);
}
-// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshlq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
@@ -13259,8 +11235,7 @@ uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
return vqshlq_n_u64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -13269,7 +11244,7 @@ int8x8_t test_vqshrn_n_s16(int16x8_t a) {
return vqshrn_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -13278,7 +11253,7 @@ int16x4_t test_vqshrn_n_s32(int32x4_t a) {
return vqshrn_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -13287,7 +11262,7 @@ int32x2_t test_vqshrn_n_s64(int64x2_t a) {
return vqshrn_n_s64(a, 1);
}
-// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -13296,7 +11271,7 @@ uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
return vqshrn_n_u16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -13305,7 +11280,7 @@ uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
return vqshrn_n_u32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -13314,8 +11289,7 @@ uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
return vqshrn_n_u64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vqshrun_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -13324,7 +11298,7 @@ uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
return vqshrun_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vqshrun_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -13333,7 +11307,7 @@ uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
return vqshrun_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vqshrun_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -13342,2722 +11316,2589 @@ uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
return vqshrun_n_s64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqsub_s8(
// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqsub_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqsub_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqsub_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqsub_u8(
// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqsub_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqsub_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqsub_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_s8(
// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_u8(
// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vqsubq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vraddhn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); }
-
-// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrecpe_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
+// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
// CHECK: ret <2 x float> [[VRECPE_V1_I]]
float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); }
-// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrecpe_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
+// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); }
-// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrecpeq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
+// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); }
-// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrecpeq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
+// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); }
-
-// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vrecps_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
+// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b) #4
// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP2]]
+// CHECK: ret <2 x float> [[VRECPS_V2_I]]
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); }
-// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vrecpsq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
+// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b) #4
// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP2]]
+// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_s8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); }
-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_s8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); }
-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_s8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); }
-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_s8_u8(
// CHECK: ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); }
-// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpret_s8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); }
-// CHECK-LABEL: define <8 x i8>
@test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_p8( // CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s8_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u16( // CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_u64( // CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s16_p16( // CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u32( // CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> // CHECK: ret 
<2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s32_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_u64( // CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> // 
CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_s64_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s8( // CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_p8( // CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// 
CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u8_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s16( // CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u16_p16( // CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x 
i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s32( // CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u32_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> // CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: define <1 x i64> 
@test_vreinterpret_u64_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_s64( // CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_u64_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> // CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// 
CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f16_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> // CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x 
float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_f32_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> // CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_s8( // CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_s64( // CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_u8( // CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p8_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_s8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_s16( // CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_u8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t 
test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_u16( // CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_f16( // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpret_p16_p8( // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> // CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u8( // CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return 
vreinterpretq_s8_u32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_p8( // CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s8_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u16( // CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } 
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_p8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> // CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s16_p16( // CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u8( // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u32( // CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK-LABEL: @test_vreinterpretq_s32_f16( // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> // CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// 
CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_u64(
// CHECK: ret <2 x i64> %a
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_s64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_s8(
// CHECK: ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_p8(
// CHECK: ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_s16(
// CHECK: ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u16_p16(
// CHECK: ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_s32(
// CHECK: ret <4 x i32> %a
uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); }
-// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_s64(
// CHECK: ret <2 x i64> %a
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); }
-// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_u64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); }
-// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f16_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK: ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); }
-// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_f32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_s8(
// CHECK: ret <16 x i8> %a
poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_u8(
// CHECK: ret <16 x i8> %a
poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_s16(
// CHECK: ret <8 x i16> %a
poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_u16(
// CHECK: ret <8 x i16> %a
poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); }
-// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vreinterpretq_p16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev16q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev32q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); }
-// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); }
-// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); }
-// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); }
-// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); }
-// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK-LABEL: @test_vrev64_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); }
-// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); }
-// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); }
-// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); }
-// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK-LABEL: @test_vrev64q_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_s8(
// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) { return vrhadd_s8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) { return vrhadd_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) { return vrhadd_s32(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_u8(
// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) { return vrhadd_u16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrhadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) { return vrhadd_u32(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_s8(
// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) { return vrhaddq_s8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) { return vrhaddq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) { return vrhaddq_s32(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_u8(
// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) { return vrhaddq_u16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrhaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) { return vrhaddq_u32(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); }
-// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); }
-// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); }
-// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); }
-// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { return vrshlq_s8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { return vrshlq_s16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { return vrshlq_s32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); }
-// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { return vrshlq_u8(a, b); }
-// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { return vrshlq_u16(a, b); }
-// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { return vrshlq_u32(a, b); }
-// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vrshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); }
-
-// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -16066,7 +13907,7 @@ int8x8_t test_vrshrn_n_s16(int16x8_t a) {
return vrshrn_n_s16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -16075,7 +13916,7 @@ int16x4_t test_vrshrn_n_s32(int32x4_t a) {
return vrshrn_n_s32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -16084,7 +13925,7 @@ int32x2_t test_vrshrn_n_s64(int64x2_t a) {
return vrshrn_n_s64(a, 1);
}
-// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -16093,7 +13934,7 @@ uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
return vrshrn_n_u16(a, 1);
}
-// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -16102,7 +13943,7 @@ uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
return vrshrn_n_u32(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
@@ -16111,15 +13952,14 @@ uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
return vrshrn_n_u64(a, 1);
}
-
-// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 1); }
-// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -16128,7 +13968,7 @@ int16x4_t test_vrshr_n_s16(int16x4_t a) {
return vrshr_n_s16(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
@@ -16137,7 +13977,7 @@ int32x2_t test_vrshr_n_s32(int32x2_t a) {
return vrshr_n_s32(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
@@ -16146,14 +13986,14 @@ int64x1_t test_vrshr_n_s64(int64x1_t a) {
return vrshr_n_s64(a, 1);
}
-// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 1); }
-// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
@@ -16162,7 +14002,7 @@ uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
return vrshr_n_u16(a, 1);
}
-// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
@@ -16171,7 +14011,7 @@ uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
return vrshr_n_u32(a, 1);
}
-// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK-LABEL: @test_vrshr_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
@@ -16180,14 +14020,14 @@ uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
return vrshr_n_u64(a, 1);
}
-//
CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_s8( // CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) @@ -16196,7 +14036,7 @@ int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) @@ -16205,7 +14045,7 @@ int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) @@ -16214,14 +14054,14 @@ int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_u8( // CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: ret <16 x i8> [[VRSHR_N]] uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) @@ -16230,7 +14070,7 @@ uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) @@ -16239,7 +14079,7 @@ uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vrshrq_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> 
@llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) @@ -16248,72 +14088,59 @@ uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 1); } - -// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +// CHECK-LABEL: @test_vrsqrte_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4 +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4 // CHECK: ret <2 x float> [[VRSQRTE_V1_I]] float32x2_t test_vrsqrte_f32(float32x2_t a) { return vrsqrte_f32(a); } -// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vrsqrte_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4 +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) #4 // CHECK: ret <2 x i32> [[VRSQRTE_V1_I]] uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +// CHECK-LABEL: @test_vrsqrteq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4 +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4 // CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] float32x4_t test_vrsqrteq_f32(float32x4_t a) { return vrsqrteq_f32(a); } -// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vrsqrteq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4 +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) #4 // CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } - -// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vrsqrts_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b) #4 // CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float> -// CHECK: ret <2 x float> [[TMP2]] +// CHECK: ret <2 x float> [[VRSQRTS_V2_I]] float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) { return vrsqrts_f32(a, b); } -// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vrsqrtsq_f32( 
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b) #4 // CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP2]] +// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]] float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) { return vrsqrtsq_f32(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_s8( // CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]] // CHECK: ret <8 x i8> [[VRSRA_N]] @@ -16321,7 +14148,7 @@ int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 1); } -// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -16333,7 +14160,7 @@ int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 1); } -// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -16345,7 +14172,7 @@ int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 1); } -// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -16357,7 +14184,7 @@ int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_u8( // CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]] // CHECK: ret <8 x i8> [[VRSRA_N]] @@ -16365,7 +14192,7 @@ uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 1); } -// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> @@ -16377,7 +14204,7 @@ uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 1); } -// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: 
@test_vrsra_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> @@ -16389,7 +14216,7 @@ uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 1); } -// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsra_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> @@ -16401,7 +14228,7 @@ uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_s8( // CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]] // CHECK: ret <16 x i8> [[VRSRA_N]] @@ -16409,7 +14236,7 @@ int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 1); } -// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> @@ -16421,7 +14248,7 @@ int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 1); } -// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> @@ -16433,7 +14260,7 @@ int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 1); } -// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> @@ -16445,7 +14272,7 @@ int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 1); } -// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_u8( // CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) // CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]] // CHECK: ret <16 x i8> [[VRSRA_N]] @@ -16453,7 +14280,7 @@ uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 1); } -// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> @@ -16465,7 +14292,7 @@ uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 1); } -// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// 
CHECK-LABEL: @test_vrsraq_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> @@ -16477,7 +14304,7 @@ uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 1); } -// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsraq_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> @@ -16489,90 +14316,72 @@ uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> 
%a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vrsubhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 zeroext %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vset_lane_u8( // CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 // CHECK: ret <8 x i8> [[VSET_LANE]] uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { return vset_lane_u8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 zeroext %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vset_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 @@ -16581,7 +14390,7 @@ uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { return vset_lane_u16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vset_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, 
i32 1 @@ -16590,14 +14399,14 @@ uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { return vset_lane_u32(a, b, 1); } -// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 signext %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vset_lane_s8( // CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 // CHECK: ret <8 x i8> [[VSET_LANE]] int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { return vset_lane_s8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 signext %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vset_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 @@ -16606,7 +14415,7 @@ int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { return vset_lane_s16(a, b, 3); } -// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vset_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 @@ -16615,14 +14424,14 @@ int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { return vset_lane_s32(a, b, 1); } -// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 signext %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vset_lane_p8( // CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 // CHECK: ret <8 x i8> [[VSET_LANE]] poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { return vset_lane_p8(a, b, 7); } -// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 signext %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vset_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 @@ -16631,7 +14440,7 @@ poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { return vset_lane_p16(a, b, 3); } -// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vset_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> // CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1 @@ -16640,7 +14449,7 @@ float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { return vset_lane_f32(a, b, 1); } -// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK-LABEL: @test_vset_lane_f16( // CHECK: [[__REINT_246:%.*]] = alloca half, align 2 // CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 // CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 @@ -16662,14 +14471,14 @@ float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { return vset_lane_f16(*a, b, 1); } -// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 zeroext %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_u8( // CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 // CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { return vsetq_lane_u8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 zeroext %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VSET_LANE:%.*]] = 
insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 @@ -16678,7 +14487,7 @@ uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { return vsetq_lane_u16(a, b, 7); } -// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 @@ -16687,14 +14496,14 @@ uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { return vsetq_lane_u32(a, b, 3); } -// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 signext %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_s8( // CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 // CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { return vsetq_lane_s8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 signext %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 @@ -16703,7 +14512,7 @@ int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { return vsetq_lane_s16(a, b, 7); } -// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 @@ -16712,14 +14521,14 @@ int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { return vsetq_lane_s32(a, b, 3); } -// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 signext %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_p8( // CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 // CHECK: ret <16 x i8> [[VSET_LANE]] poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) { return vsetq_lane_p8(a, b, 15); } -// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 signext %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_p16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 @@ -16728,7 +14537,7 @@ poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { return vsetq_lane_p16(a, b, 7); } -// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> // CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3 @@ -16737,7 +14546,7 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { return vsetq_lane_f32(a, b, 3); } -// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_f16( // CHECK: [[__REINT_248:%.*]] = alloca half, align 2 // CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 // CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 @@ -16759,8 +14568,7 @@ float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { return vsetq_lane_f16(*a, b, 3); } -// The optimizer is able to get rid of all moves now. 
-// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vset_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 @@ -16769,8 +14577,7 @@ int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { return vset_lane_s64(a, b, 0); } -// The optimizer is able to get rid of all moves now. -// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vset_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 @@ -16779,7 +14586,7 @@ uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { return vset_lane_u64(a, b, 0); } -// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 @@ -16788,7 +14595,7 @@ int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { return vsetq_lane_s64(a, b, 1); } -// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsetq_lane_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 @@ -16797,193 +14604,155 @@ uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { return vsetq_lane_u64(a, b, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vshl_s8( // CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VSHL_V_I]] int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vshl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VSHL_V2_I]] int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vshl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <2 
x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VSHL_V2_I]] int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vshl_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VSHL_V2_I]] int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vshl_u8( // CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4 // CHECK: ret <8 x i8> [[VSHL_V_I]] uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vshl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK: ret <4 x i16> [[VSHL_V2_I]] uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vshl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK: ret <2 x i32> [[VSHL_V2_I]] uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vshl_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // 
CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4 // CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK: ret <1 x i64> [[VSHL_V2_I]] uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vshlq_s8( // CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VSHLQ_V_I]] int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { return vshlq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vshlq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VSHLQ_V2_I]] int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { return vshlq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vshlq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VSHLQ_V2_I]] int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { return vshlq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vshlq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x 
i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VSHLQ_V2_I]] int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vshlq_u8( // CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4 // CHECK: ret <16 x i8> [[VSHLQ_V_I]] uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { return vshlq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vshlq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK: ret <8 x i16> [[VSHLQ_V2_I]] uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { return vshlq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vshlq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK: ret <4 x i32> [[VSHLQ_V2_I]] uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { return vshlq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vshlq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4 // CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK: ret <2 x i64> [[VSHLQ_V2_I]] uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s8( // CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> 
[[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -16991,7 +14760,7 @@ int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> @@ -17001,7 +14770,7 @@ int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> @@ -17011,7 +14780,7 @@ int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u8( // CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> // CHECK: ret <8 x i16> [[VSHLL_N]] @@ -17019,7 +14788,7 @@ uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> @@ -17029,7 +14798,7 @@ uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshll_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> @@ -17039,15 +14808,14 @@ uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshl_n_s8( // CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshl_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> @@ -17056,7 +14824,7 @@ int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshl_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> @@ -17065,7 +14833,7 @@ int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 1); } -// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vshl_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> @@ -17074,14 +14842,14 @@ int64x1_t 
test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); } -// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshl_n_u8( // CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <8 x i8> [[VSHL_N]] uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshl_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> @@ -17090,7 +14858,7 @@ uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshl_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> @@ -17099,7 +14867,7 @@ uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 1); } -// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vshl_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> @@ -17108,14 +14876,14 @@ uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_s8( // CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -17124,7 +14892,7 @@ int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> @@ -17133,7 +14901,7 @@ int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> @@ -17142,14 +14910,14 @@ int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_u8( // CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <16 x i8> [[VSHL_N]] uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: 
@test_vshlq_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -17158,7 +14926,7 @@ uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return vshlq_n_u16(a, 1); } -// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> @@ -17167,7 +14935,7 @@ uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 1); } -// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vshlq_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> @@ -17176,8 +14944,7 @@ uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -17187,7 +14954,7 @@ int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> @@ -17197,7 +14964,7 @@ int16x4_t test_vshrn_n_s32(int32x4_t a) { return vshrn_n_s32(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1> @@ -17207,7 +14974,7 @@ int32x2_t test_vshrn_n_s64(int64x2_t a) { return vshrn_n_s64(a, 1); } -// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -17217,7 +14984,7 @@ uint8x8_t test_vshrn_n_u16(uint16x8_t a) { return vshrn_n_u16(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> @@ -17227,7 +14994,7 @@ uint16x4_t test_vshrn_n_u32(uint32x4_t a) { return vshrn_n_u32(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 { +// CHECK-LABEL: @test_vshrn_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> // CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1> @@ -17237,15 +15004,14 @@ uint32x2_t 
test_vshrn_n_u64(uint64x2_t a) { return vshrn_n_u64(a, 1); } - -// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshr_n_s8( // CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshr_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> @@ -17254,7 +15020,7 @@ int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshr_n_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1> @@ -17263,7 +15029,7 @@ int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 1); } -// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vshr_n_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1> @@ -17272,14 +15038,14 @@ int64x1_t test_vshr_n_s64(int64x1_t a) { return vshr_n_s64(a, 1); } -// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 { +// CHECK-LABEL: @test_vshr_n_u8( // CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <8 x i8> [[VSHR_N]] uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 1); } -// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 { +// CHECK-LABEL: @test_vshr_n_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> @@ -17288,7 +15054,7 @@ uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 1); } -// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 { +// CHECK-LABEL: @test_vshr_n_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1> @@ -17297,7 +15063,7 @@ uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 1); } -// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 { +// CHECK-LABEL: @test_vshr_n_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1> @@ -17306,14 +15072,14 @@ uint64x1_t test_vshr_n_u64(uint64x1_t a) { return vshr_n_u64(a, 1); } -// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 { +// CHECK-LABEL: @test_vshrq_n_s8( // CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> // CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 1); } -// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 { +// CHECK-LABEL: @test_vshrq_n_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <8 x i16>
 // CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -17322,7 +15088,7 @@ int16x8_t test_vshrq_n_s16(int16x8_t a) {
   return vshrq_n_s16(a, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
@@ -17331,7 +15097,7 @@ int32x4_t test_vshrq_n_s32(int32x4_t a) {
   return vshrq_n_s32(a, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
 // CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
@@ -17340,14 +15106,14 @@ int64x2_t test_vshrq_n_s64(int64x2_t a) {
   return vshrq_n_s64(a, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u8(
 // CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 // CHECK: ret <16 x i8> [[VSHR_N]]
 uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
   return vshrq_n_u8(a, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 // CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -17356,7 +15122,7 @@ uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
   return vshrq_n_u16(a, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
@@ -17365,7 +15131,7 @@ uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
   return vshrq_n_u32(a, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK-LABEL: @test_vshrq_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
 // CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
@@ -17374,15 +15140,14 @@ uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
   return vshrq_n_u64(a, 1);
 }

-
-// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
   return vsli_n_s8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17393,7 +15158,7 @@ int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
   return vsli_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s32(
 // CHECK: [[TMP0:%.*]]
= bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17404,7 +15169,7 @@ int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
   return vsli_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17415,14 +15180,14 @@ int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
   return vsli_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsli_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17433,7 +15198,7 @@ uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsli_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17444,7 +15209,7 @@ uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsli_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17455,14 +15220,14 @@ uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsli_n_u64(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_p8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsli_n_p8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsli_n_p16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17473,14 +15238,14 @@ poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsli_n_p16(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
   return vsliq_n_s8(a, b, 1);
 }

-//
CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17491,7 +15256,7 @@ int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
   return vsliq_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17502,7 +15267,7 @@ int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
   return vsliq_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17513,14 +15278,14 @@ int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
   return vsliq_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsliq_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17531,7 +15296,7 @@ uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsliq_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17542,7 +15307,7 @@ uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsliq_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17553,14 +15318,14 @@ uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsliq_n_u64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_p8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsliq_n_p8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsliq_n_p16(
 // CHECK: [[TMP0:%.*]] =
bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17571,8 +15336,7 @@ poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsliq_n_p16(a, b, 1);
 }

-
-// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s8(
 // CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 // CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
 // CHECK: ret <8 x i8> [[TMP0]]
@@ -17580,7 +15344,7 @@ int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
   return vsra_n_s8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17592,7 +15356,7 @@ int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
   return vsra_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17604,7 +15368,7 @@ int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
   return vsra_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17616,7 +15380,7 @@ int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
   return vsra_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u8(
 // CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 // CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
 // CHECK: ret <8 x i8> [[TMP0]]
@@ -17624,7 +15388,7 @@ uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsra_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17636,7 +15400,7 @@ uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsra_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17648,7 +15412,7 @@ uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsra_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsra_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17660,7 +15424,7 @@ uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsra_n_u64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8>
@test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s8(
 // CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 // CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
 // CHECK: ret <16 x i8> [[TMP0]]
@@ -17668,7 +15432,7 @@ int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
   return vsraq_n_s8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17680,7 +15444,7 @@ int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
   return vsraq_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17692,7 +15456,7 @@ int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
   return vsraq_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17704,7 +15468,7 @@ int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
   return vsraq_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u8(
 // CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 // CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
 // CHECK: ret <16 x i8> [[TMP0]]
@@ -17712,7 +15476,7 @@ uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsraq_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17724,7 +15488,7 @@ uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsraq_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17736,7 +15500,7 @@ uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsraq_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsraq_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17748,15 +15512,14 @@ uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsraq_n_u64(a, b, 1);
 }

-
-// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8>
@llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
   return vsri_n_s8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17767,7 +15530,7 @@ int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
   return vsri_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17778,7 +15541,7 @@ int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
   return vsri_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17789,14 +15552,14 @@ int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
   return vsri_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsri_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17807,7 +15570,7 @@ uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsri_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
@@ -17818,7 +15581,7 @@ uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsri_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
@@ -17829,14 +15592,14 @@ uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsri_n_u64(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_p8(
 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsri_n_p8(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16>
%a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsri_n_p16(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
@@ -17847,14 +15610,14 @@ poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsri_n_p16(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
   return vsriq_n_s8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17865,7 +15628,7 @@ int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
   return vsriq_n_s16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17876,7 +15639,7 @@ int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
   return vsriq_n_s32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_s64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17887,14 +15650,14 @@ int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
   return vsriq_n_s64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsriq_n_u8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17905,7 +15668,7 @@ uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsriq_n_u16(a, b, 1);
 }

-// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u32(
 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
@@ -17916,7 +15679,7 @@ uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsriq_n_u32(a, b, 1);
 }

-// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_u64(
 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
 //
CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
@@ -17927,14 +15690,14 @@ uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsriq_n_u64(a, b, 1);
 }

-// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_p8(
 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 // CHECK: ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsriq_n_p8(a, b, 1);
 }

-// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vsriq_n_p16(
 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
@@ -17945,15 +15708,14 @@ poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsriq_n_p16(a, b, 1);
 }

-
-// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
   vst1q_u8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -17963,7 +15725,7 @@ void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
   vst1q_u16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -17973,7 +15735,7 @@ void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
   vst1q_u32(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_u64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -17983,14 +15745,14 @@ void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
   vst1q_u64(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1q_s8(int8_t * a, int8x16_t b) {
   vst1q_s8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18000,7 +15762,7 @@ void test_vst1q_s16(int16_t * a, int16x8_t b) {
   vst1q_s16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -18010,7 +15772,7 @@ void test_vst1q_s32(int32_t * a, int32x4_t b) {
   vst1q_s32(a, b);
 }

-// CHECK-LABEL: define void
@test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -18020,7 +15782,7 @@ void test_vst1q_s64(int64_t * a, int64x2_t b) {
   vst1q_s64(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1q_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18030,7 +15792,7 @@ void test_vst1q_f16(float16_t * a, float16x8_t b) {
   vst1q_f16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1q_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
@@ -18040,14 +15802,14 @@ void test_vst1q_f32(float32_t * a, float32x4_t b) {
   vst1q_f32(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_p8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
   vst1q_p8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18057,14 +15819,14 @@ void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
   vst1q_p16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_u8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1_u8(uint8_t * a, uint8x8_t b) {
   vst1_u8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18074,7 +15836,7 @@ void test_vst1_u16(uint16_t * a, uint16x4_t b) {
   vst1_u16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -18084,7 +15846,7 @@ void test_vst1_u32(uint32_t * a, uint32x2_t b) {
   vst1_u32(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_u64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18094,14 +15856,14 @@ void test_vst1_u64(uint64_t * a, uint64x1_t b) {
   vst1_u64(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_s8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1_s8(int8_t * a, int8x8_t b) {
   vst1_s8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast
<4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18111,7 +15873,7 @@ void test_vst1_s16(int16_t * a, int16x4_t b) {
   vst1_s16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -18121,7 +15883,7 @@ void test_vst1_s32(int32_t * a, int32x2_t b) {
   vst1_s32(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18131,7 +15893,7 @@ void test_vst1_s64(int64_t * a, int64x1_t b) {
   vst1_s64(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18141,7 +15903,7 @@ void test_vst1_f16(float16_t * a, float16x4_t b) {
   vst1_f16(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
@@ -18151,14 +15913,14 @@ void test_vst1_f32(float32_t * a, float32x2_t b) {
   vst1_f32(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_p8(
 // CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
 // CHECK: ret void
 void test_vst1_p8(poly8_t * a, poly8x8_t b) {
   vst1_p8(a, b);
 }

-// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18168,8 +15930,7 @@ void test_vst1_p16(poly16_t * a, poly16x4_t b) {
   vst1_p16(a, b);
 }

-
-// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_u8(
 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 // CHECK: ret void
@@ -18177,7 +15938,7 @@ void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
   vst1q_lane_u8(a, b, 15);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18189,7 +15950,7 @@ void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
   vst1q_lane_u16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -18201,7 +15962,7 @@ void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
   vst1q_lane_u32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_u64(
 // CHECK:
[[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -18212,7 +15973,7 @@ void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
   vst1q_lane_u64(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_s8(
 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 // CHECK: ret void
@@ -18220,7 +15981,7 @@ void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
   vst1q_lane_s8(a, b, 15);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18232,7 +15993,7 @@ void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
   vst1q_lane_s16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
@@ -18244,7 +16005,7 @@ void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
   vst1q_lane_s32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
@@ -18255,7 +16016,7 @@ void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18267,7 +16028,7 @@ void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
@@ -18279,7 +16040,7 @@ void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
   vst1q_lane_f32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_p8(
 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 // CHECK: ret void
@@ -18287,7 +16048,7 @@ void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
   vst1q_lane_p8(a, b, 15);
 }

-// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1q_lane_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
@@ -18299,7 +16060,7 @@ void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
   vst1q_lane_p16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_u8(
 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 //
CHECK: ret void
@@ -18307,7 +16068,7 @@ void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
   vst1_lane_u8(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_u16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18319,7 +16080,7 @@ void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
   vst1_lane_u16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_u32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -18331,7 +16092,7 @@ void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
   vst1_lane_u32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_u64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18343,7 +16104,7 @@ void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
   vst1_lane_u64(a, b, 0);
 }

-// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_s8(
 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 // CHECK: ret void
@@ -18351,7 +16112,7 @@ void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
   vst1_lane_s8(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_s16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18363,7 +16124,7 @@ void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
   vst1_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_s32(
 // CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
@@ -18375,7 +16136,7 @@ void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
   vst1_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_s64(
 // CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
@@ -18387,7 +16148,7 @@ void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }

-// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18399,7 +16160,7 @@ void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_f32(
 // CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
@@ -18411,7 +16172,7 @@ void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
   vst1_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_p8(
 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
 // CHECK: store i8 [[TMP0]], i8* %a, align 1
 // CHECK: ret void
@@ -18419,7 +16180,7 @@ void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
   vst1_lane_p8(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vst1_lane_p16(
 // CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
@@ -18431,8 +16192,7 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
   vst1_lane_p16(a, b, 3);
 }

-
-// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
@@ -18453,7 +16213,7 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
   vst2q_u8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
@@ -18479,7 +16239,7 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_u16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
@@ -18505,7 +16265,7 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_u32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
@@ -18526,7 +16286,7 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
   vst2q_s8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
@@ -18552,7 +16312,7 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
   vst2q_s16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
@@ -18578,7 +16338,7 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
   vst2q_s32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_f16(
 // CHECK:
[[B:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
@@ -18604,7 +16364,7 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
   vst2q_f16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
@@ -18630,7 +16390,7 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
   vst2q_f32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
@@ -18651,7 +16411,7 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
   vst2q_p8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
@@ -18677,7 +16437,7 @@ void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_p16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
@@ -18698,7 +16458,7 @@ void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_u8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
@@ -18724,7 +16484,7 @@ void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_u16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
@@ -18750,7 +16510,7 @@ void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_u32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_u64(
 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
@@ -18776,7 +16536,7 @@ void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
   vst2_u64(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL:
@test_vst2_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
@@ -18797,7 +16557,7 @@ void test_vst2_s8(int8_t * a, int8x8x2_t b) {
   vst2_s8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
@@ -18823,7 +16583,7 @@ void test_vst2_s16(int16_t * a, int16x4x2_t b) {
   vst2_s16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
@@ -18849,7 +16609,7 @@ void test_vst2_s32(int32_t * a, int32x2x2_t b) {
   vst2_s32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_s64(
 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
@@ -18875,7 +16635,7 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
   vst2_s64(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
@@ -18901,7 +16661,7 @@ void test_vst2_f16(float16_t * a, float16x4x2_t b) {
   vst2_f16(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
@@ -18927,7 +16687,7 @@ void test_vst2_f32(float32_t * a, float32x2x2_t b) {
   vst2_f32(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
@@ -18948,7 +16708,7 @@ void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
   vst2_p8(a, b);
 }

-// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
@@ -18974,8 +16734,7 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_p16(a, b);
 }

-
-// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL:
@test_vst2q_lane_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
@@ -19001,7 +16760,7 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_lane_u16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
@@ -19027,7 +16786,7 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_lane_u32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
@@ -19053,7 +16812,7 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
   vst2q_lane_s16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
@@ -19079,7 +16838,7 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
   vst2q_lane_s32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
@@ -19105,7 +16864,7 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
@@ -19131,7 +16890,7 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2q_lane_p16(
 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
@@ -19157,7 +16916,7 @@ void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_lane_p16(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_u8(
 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t*
[[B]], i32 0, i32 0
@@ -19178,7 +16937,7 @@ void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_lane_u8(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_u16(
 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
@@ -19204,7 +16963,7 @@ void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_lane_u16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_u32(
 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
@@ -19230,7 +16989,7 @@ void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_lane_u32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_s8(
 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
@@ -19251,7 +17010,7 @@ void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
   vst2_lane_s8(a, b, 7);
 }

-// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_s16(
 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
@@ -19277,7 +17036,7 @@ void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
   vst2_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_s32(
 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
@@ -19303,7 +17062,7 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
   vst2_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_f16(
 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
@@ -19329,7 +17088,7 @@ void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
   vst2_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_f32(
 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
@@ -19355,7 +17114,7 @@ void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
   vst2_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK-LABEL: @test_vst2_lane_p8(
 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t,
align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 @@ -19376,7 +17135,7 @@ void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) { vst2_lane_p8(a, b, 7); } -// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst2_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 @@ -19402,8 +17161,7 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) { vst2_lane_p16(a, b, 3); } - -// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 @@ -19427,7 +17185,7 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) { vst3q_u8(a, b); } -// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 @@ -19458,7 +17216,7 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) { vst3q_u16(a, b); } -// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 @@ -19489,7 +17247,7 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) { vst3q_u32(a, b); } -// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 @@ -19513,7 +17271,7 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) { vst3q_s8(a, b); } -// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 @@ -19544,7 +17302,7 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) { vst3q_s16(a, b); } -// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 @@ -19575,7 +17333,7 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) { vst3q_s32(a, b); } -// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_f16( // 
CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 @@ -19606,7 +17364,7 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 @@ -19637,7 +17395,7 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 @@ -19661,7 +17419,7 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 @@ -19692,7 +17450,7 @@ void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 @@ -19716,7 +17474,7 @@ void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 @@ -19747,7 +17505,7 @@ void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 @@ -19778,7 +17536,7 @@ void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_u64( // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 @@ -19809,7 +17567,7 @@ void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x i64] %b.coerce) #0 { +// 
CHECK-LABEL: @test_vst3_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 @@ -19833,7 +17591,7 @@ void test_vst3_s8(int8_t * a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 @@ -19864,7 +17622,7 @@ void test_vst3_s16(int16_t * a, int16x4x3_t b) { vst3_s16(a, b); } -// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 @@ -19895,7 +17653,7 @@ void test_vst3_s32(int32_t * a, int32x2x3_t b) { vst3_s32(a, b); } -// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_s64( // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0 @@ -19926,7 +17684,7 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 @@ -19957,7 +17715,7 @@ void test_vst3_f16(float16_t * a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 @@ -19988,7 +17746,7 @@ void test_vst3_f32(float32_t * a, float32x2x3_t b) { vst3_f32(a, b); } -// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 @@ -20012,7 +17770,7 @@ void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { vst3_p8(a, b); } -// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 @@ -20043,8 +17801,7 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) { vst3_p16(a, b); } - -// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: 
@test_vst3q_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 @@ -20075,7 +17832,7 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) { vst3q_lane_u16(a, b, 7); } -// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 @@ -20106,7 +17863,7 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) { vst3q_lane_u32(a, b, 3); } -// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 @@ -20137,7 +17894,7 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) { vst3q_lane_s16(a, b, 7); } -// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 @@ -20168,7 +17925,7 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) { vst3q_lane_s32(a, b, 3); } -// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 @@ -20199,7 +17956,7 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) { vst3q_lane_f16(a, b, 7); } -// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 @@ -20230,7 +17987,7 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) { vst3q_lane_f32(a, b, 3); } -// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [6 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3q_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 @@ -20261,7 +18018,7 @@ void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) { vst3q_lane_p16(a, b, 7); } -// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* 
[[B]], i32 0, i32 0 @@ -20285,7 +18042,7 @@ void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) { vst3_lane_u8(a, b, 7); } -// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 @@ -20316,7 +18073,7 @@ void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) { vst3_lane_u16(a, b, 3); } -// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 @@ -20347,7 +18104,7 @@ void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) { vst3_lane_u32(a, b, 1); } -// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 @@ -20371,7 +18128,7 @@ void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) { vst3_lane_s8(a, b, 7); } -// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 @@ -20402,7 +18159,7 @@ void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) { vst3_lane_s16(a, b, 3); } -// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 @@ -20433,7 +18190,7 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) { vst3_lane_s32(a, b, 1); } -// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 @@ -20464,7 +18221,7 @@ void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); } -// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 @@ -20495,7 +18252,7 @@ void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) { vst3_lane_f32(a, b, 1); } -// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, 
align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 @@ -20519,7 +18276,7 @@ void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) { vst3_lane_p8(a, b, 7); } -// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst3_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 @@ -20550,8 +18307,7 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); } - -// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 @@ -20578,7 +18334,7 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) { vst4q_u8(a, b); } -// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 @@ -20614,7 +18370,7 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) { vst4q_u16(a, b); } -// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 @@ -20650,7 +18406,7 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) { vst4q_u32(a, b); } -// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 @@ -20677,7 +18433,7 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) { vst4q_s8(a, b); } -// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 @@ -20713,7 +18469,7 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) { vst4q_s16(a, b); } -// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 @@ -20749,7 +18505,7 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) { vst4q_s32(a, b); } -// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_f16( // 
CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 @@ -20785,7 +18541,7 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) { vst4q_f16(a, b); } -// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 @@ -20821,7 +18577,7 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) { vst4q_f32(a, b); } -// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 @@ -20848,7 +18604,7 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { vst4q_p8(a, b); } -// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 @@ -20884,7 +18640,7 @@ void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { vst4q_p16(a, b); } -// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 @@ -20911,7 +18667,7 @@ void test_vst4_u8(uint8_t * a, uint8x8x4_t b) { vst4_u8(a, b); } -// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 @@ -20947,7 +18703,7 @@ void test_vst4_u16(uint16_t * a, uint16x4x4_t b) { vst4_u16(a, b); } -// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 @@ -20983,7 +18739,7 @@ void test_vst4_u32(uint32_t * a, uint32x2x4_t b) { vst4_u32(a, b); } -// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_u64( // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 @@ -21019,7 +18775,7 @@ void test_vst4_u64(uint64_t * a, uint64x1x4_t b) { vst4_u64(a, b); } -// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x i64] %b.coerce) #0 { +// 
CHECK-LABEL: @test_vst4_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 @@ -21046,7 +18802,7 @@ void test_vst4_s8(int8_t * a, int8x8x4_t b) { vst4_s8(a, b); } -// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 @@ -21082,7 +18838,7 @@ void test_vst4_s16(int16_t * a, int16x4x4_t b) { vst4_s16(a, b); } -// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 @@ -21118,7 +18874,7 @@ void test_vst4_s32(int32_t * a, int32x2x4_t b) { vst4_s32(a, b); } -// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_s64( // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 @@ -21154,7 +18910,7 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) { vst4_s64(a, b); } -// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 @@ -21190,7 +18946,7 @@ void test_vst4_f16(float16_t * a, float16x4x4_t b) { vst4_f16(a, b); } -// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 @@ -21226,7 +18982,7 @@ void test_vst4_f32(float32_t * a, float32x2x4_t b) { vst4_f32(a, b); } -// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 @@ -21253,7 +19009,7 @@ void test_vst4_p8(poly8_t * a, poly8x8x4_t b) { vst4_p8(a, b); } -// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 @@ -21289,8 +19045,7 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) { vst4_p16(a, b); } - -// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: 
@test_vst4q_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 @@ -21326,7 +19081,7 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) { vst4q_lane_u16(a, b, 7); } -// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 @@ -21362,7 +19117,7 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) { vst4q_lane_u32(a, b, 3); } -// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 @@ -21398,7 +19153,7 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) { vst4q_lane_s16(a, b, 7); } -// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 @@ -21434,7 +19189,7 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 @@ -21470,7 +19225,7 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) { vst4q_lane_f16(a, b, 7); } -// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 @@ -21506,7 +19261,7 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) { vst4q_lane_f32(a, b, 3); } -// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [8 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4q_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 @@ -21542,7 +19297,7 @@ void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) { vst4q_lane_p16(a, b, 7); } -// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_u8( // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* 
[[B]], i32 0, i32 0 @@ -21569,7 +19324,7 @@ void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) { vst4_lane_u8(a, b, 7); } -// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_u16( // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 @@ -21605,7 +19360,7 @@ void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) { vst4_lane_u16(a, b, 3); } -// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_u32( // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 @@ -21641,7 +19396,7 @@ void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) { vst4_lane_u32(a, b, 1); } -// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_s8( // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 @@ -21668,7 +19423,7 @@ void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) { vst4_lane_s8(a, b, 7); } -// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_s16( // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 @@ -21704,7 +19459,7 @@ void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) { vst4_lane_s16(a, b, 3); } -// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_s32( // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 @@ -21740,7 +19495,7 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) { vst4_lane_s32(a, b, 1); } -// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_f16( // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 @@ -21776,7 +19531,7 @@ void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) { vst4_lane_f16(a, b, 3); } -// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_f32( // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 @@ -21812,7 +19567,7 @@ void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) { vst4_lane_f32(a, b, 1); } -// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_p8( // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, 
align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 @@ -21839,7 +19594,7 @@ void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) { vst4_lane_p8(a, b, 7); } -// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 { +// CHECK-LABEL: @test_vst4_lane_p16( // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 @@ -21875,140 +19630,136 @@ void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) { vst4_lane_p16(a, b, 3); } - -// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsub_s8( // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b // CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) { return vsub_s8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsub_s16( // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b // CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) { return vsub_s16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsub_s32( // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b // CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) { return vsub_s32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsub_s64( // CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b // CHECK: ret <1 x i64> [[SUB_I]] int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) { return vsub_s64(a, b); } -// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vsub_f32( // CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b // CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) { return vsub_f32(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsub_u8( // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b // CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) { return vsub_u8(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsub_u16( // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b // CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) { return vsub_u16(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsub_u32( // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b // CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) { return vsub_u32(a, b); } -// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK-LABEL: @test_vsub_u64( // CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b // CHECK: ret <1 x i64> [[SUB_I]] uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) { return vsub_u64(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubq_s8( // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b // CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) { return vsubq_s8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x 
i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubq_s16( // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b // CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) { return vsubq_s16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubq_s32( // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) { return vsubq_s32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubq_s64( // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) { return vsubq_s64(a, b); } -// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vsubq_f32( // CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b // CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) { return vsubq_f32(a, b); } -// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubq_u8( // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b // CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) { return vsubq_u8(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubq_u16( // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b // CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) { return vsubq_u16(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubq_u32( // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) { return vsubq_u32(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubq_u64( // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) { return vsubq_u64(a, b); } - -// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VSUBHN2_I]] @@ -22016,12 +19767,10 @@ int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, 
i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VSUBHN2_I]] @@ -22029,12 +19778,10 @@ int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VSUBHN2_I]] @@ -22042,12 +19789,10 @@ int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { return vsubhn_s64(a, b); } -// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> // CHECK: ret <8 x i8> [[VSUBHN2_I]] @@ -22055,12 +19800,10 @@ uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); } -// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> // CHECK: ret <4 x i16> [[VSUBHN2_I]] @@ -22068,12 +19811,10 @@ uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); } -// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK-LABEL: @test_vsubhn_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> // CHECK: ret <2 x i32> [[VSUBHN2_I]] @@ -22081,8 +19822,7 @@ uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { return vsubhn_u64(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_s8( // CHECK: [[VMOVL_I_I:%.*]] = 
sext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -22091,33 +19831,29 @@ int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { return vsubl_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { return vsubl_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { return vsubl_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubl_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> // CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] @@ -22126,34 +19862,29 @@ uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { return vsubl_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubl_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { return vsubl_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubl_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext 
<2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { return vsubl_u32(a, b); } - -// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_s8( // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -22161,27 +19892,25 @@ int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { return vsubw_s8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { return vsubw_s16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); } -// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vsubw_u8( // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] // CHECK: ret <8 x i16> [[SUB_I]] @@ -22189,50 +19918,46 @@ uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); } -// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vsubw_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] // CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); } -// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vsubw_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] // CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); } - -// CHECK-LABEL: define <8 x i8> 
@test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl1_u8(
// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VTBL1_I]]
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl1_s8(
// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VTBL1_I]]
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl1_p8(
// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK: ret <8 x i8> [[VTBL1_I]]
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl2_u8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
@@ -22256,7 +19981,7 @@ uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl2_s8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
@@ -22280,7 +20005,7 @@ int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl2_p8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
@@ -22304,8 +20029,7 @@ poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl3_u8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
@@ -22332,7 +20056,7 @@ uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
  return vtbl3_u8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl3_s8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
@@ -22359,7 +20083,7 @@ int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
  return vtbl3_s8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl3_p8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
@@ -22386,8 +20110,7 @@ poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
  return vtbl3_p8(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl4_u8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
@@ -22417,7 +20140,7 @@ uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
  return vtbl4_u8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl4_s8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
@@ -22447,7 +20170,7 @@ int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
  return vtbl4_s8(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtbl4_p8(
// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
@@ -22477,30 +20200,28 @@ poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
  return vtbl4_p8(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx1_u8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK: ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vtbx1_u8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx1_s8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK: ret <8 x i8> [[VTBX1_I]]
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vtbx1_s8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx1_p8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK: ret <8 x i8> [[VTBX1_I]]
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
  return vtbx1_p8(a, b, c);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx2_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
@@ -22524,7 +20245,7 @@ uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
  return vtbx2_u8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx2_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
@@ -22548,7 +20269,7 @@ int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
  return vtbx2_s8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx2_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
@@ -22572,8 +20293,7 @@ poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
  return vtbx2_p8(a, b, c);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx3_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
@@ -22600,7 +20320,7 @@ uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
  return vtbx3_u8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx3_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
@@ -22627,7 +20347,7 @@ int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
  return vtbx3_s8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx3_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
@@ -22654,8 +20374,7 @@ poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
  return vtbx3_p8(a, b, c);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx4_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
@@ -22685,7 +20404,7 @@ uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
  return vtbx4_u8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx4_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
@@ -22715,7 +20434,7 @@ int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
  return vtbx4_s8(a, b, c);
}

-// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK-LABEL: @test_vtbx4_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
@@ -22745,16 +20464,15 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
  return vtbx4_p8(a, b, c);
}

-
-// CHECK-LABEL: define void @test_vtrn_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrn_s8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !3
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -22763,57 +20481,53 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrn_s16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !6
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtrn_s32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !9
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrn_u8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !12
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -22822,78 +20536,72 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrn_u16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !15
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtrn_u32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !18
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vtrn_f32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !noalias !21
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
  return vtrn_f32(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrn_p8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !24
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -22902,36 +20610,34 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
  return vtrn_p8(a, b);
}

-// CHECK-LABEL: define void @test_vtrn_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrn_p16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !27
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
  return vtrn_p16(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_s8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !30
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
@@ -22940,57 +20646,53 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_s16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !33
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_s32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !36
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_u8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !39
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
@@ -22999,78 +20701,72 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_u16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !42
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_u32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !45
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_f32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !noalias !48
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_p8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !51
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
@@ -23079,29 +20775,26 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}

-// CHECK-LABEL: define void @test_vtrnq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtrnq_p16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !54
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
// CHECK: ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}

-
-// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtst_s8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
@@ -23110,33 +20803,29 @@ uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtst_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtst_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtst_u8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
@@ -23145,33 +20834,29 @@ uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtst_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}

-// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtst_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}

-// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtst_p8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
@@ -23180,20 +20865,18 @@ uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}

-// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtst_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtstq_s8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
@@ -23202,33 +20885,29 @@ uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtstq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtstq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtstq_u8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
@@ -23237,33 +20916,29 @@ uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtstq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}

-// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK-LABEL: @test_vtstq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}

-// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vtstq_p8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
@@ -23272,29 +20947,26 @@ uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}

-// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vtstq_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}

-
-// CHECK-LABEL: define void @test_vuzp_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp_s8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !57
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -23303,57 +20975,53 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp_s16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !60
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp_s32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !63
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp_u8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !66
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -23362,78 +21030,72 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp_u16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !69
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK-LABEL: @test_vuzp_u32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !72
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
+// CHECK-LABEL: @test_vuzp_f32(
// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !noalias !75
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzp_p8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !78
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
@@ -23442,36 +21104,34 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}

-// CHECK-LABEL: define void @test_vuzp_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzp_p16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
-// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !81
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
// CHECK: ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}

-// CHECK-LABEL: define void @test_vuzpq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK-LABEL: @test_vuzpq_s8(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !84
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
@@ -23480,57 +21140,53 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}

-// CHECK-LABEL: define void @test_vuzpq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK-LABEL: @test_vuzpq_s16(
// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8
x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !87 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); } -// CHECK-LABEL: define void @test_vuzpq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vuzpq_s32( // CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> -// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !90 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); } -// CHECK-LABEL: define void @test_vuzpq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vuzpq_u8( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* // 
CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> -// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !93 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 // CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> -// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93 // CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4 @@ -23539,78 +21195,72 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); } -// CHECK-LABEL: define void @test_vuzpq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vuzpq_u16( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !96 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); } -// CHECK-LABEL: define void @test_vuzpq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 { 
+// CHECK-LABEL: @test_vuzpq_u32( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> -// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !99 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); } -// CHECK-LABEL: define void @test_vuzpq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vuzpq_f32( // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> -// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> -// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !noalias !102 +// CHECK: 
[[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102 +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); } -// CHECK-LABEL: define void @test_vuzpq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vuzpq_p8( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> -// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !105 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 // CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> -// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105 // CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4 @@ -23619,37 +21269,34 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); } -// CHECK-LABEL: define void @test_vuzpq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vuzpq_p16( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x 
i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !108 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108 +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); } - -// CHECK-LABEL: define void @test_vzip_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip_s8( // CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !111 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111 // CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4 @@ -23658,57 +21305,53 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); } -// CHECK-LABEL: define void @test_vzip_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip_s16( // CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store 
<4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !114 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); } -// CHECK-LABEL: define void @test_vzip_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip_s32( // CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> -// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !117 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); } -// CHECK-LABEL: define void @test_vzip_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip_u8( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !120 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 
7, i32 15> -// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120 // CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4 @@ -23717,78 +21360,72 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); } -// CHECK-LABEL: define void @test_vzip_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip_u16( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !123 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK-LABEL: define void @test_vzip_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 { +// CHECK-LABEL: @test_vzip_u32( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> -// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x 
i32> <i32 1, i32 3> -// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !126 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK-LABEL: define void @test_vzip_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 { +// CHECK-LABEL: @test_vzip_f32( // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2> -// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3> -// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !noalias !129 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129 +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK-LABEL: define void @test_vzip_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 { +// CHECK-LABEL: @test_vzip_p8( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = 
bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !132 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132 // CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4 @@ -23797,36 +21434,34 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } -// CHECK-LABEL: define void @test_vzip_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 { +// CHECK-LABEL: @test_vzip_p16( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !135 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135 +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4 // CHECK: ret void poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK-LABEL: define void @test_vzipq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzipq_s8( // CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x 
i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> -// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !138 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> -// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138 // CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4 @@ -23835,57 +21470,53 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK-LABEL: define void @test_vzipq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzipq_s16( // CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !141 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK-LABEL: define void @test_vzipq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzipq_s32( // CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast 
%struct.int32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !144 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144 +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK-LABEL: define void @test_vzipq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzipq_u8( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> -// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !147 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> -// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147 // CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4 @@ -23894,78 +21525,72 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK-LABEL: define void @test_vzipq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzipq_u16( // CHECK: [[__RET_I:%.*]] = alloca 
%struct.uint16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !150 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK-LABEL: define void @test_vzipq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK-LABEL: @test_vzipq_u32( // CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !153 
+// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153 +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK-LABEL: define void @test_vzipq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 { +// CHECK-LABEL: @test_vzipq_f32( // CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5> -// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7> -// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> +// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !noalias !156 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> +// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156 +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK-LABEL: define void @test_vzipq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 { +// CHECK-LABEL: @test_vzipq_p8( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> -// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !159 // CHECK: [[TMP2:%.*]] = getelementptr 
inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 // CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> -// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159 // CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8* // CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4 @@ -23974,25 +21599,21 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK-LABEL: define void @test_vzipq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK-LABEL: @test_vzipq_p16( // CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] -// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] -// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8* -// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4 +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !162 +// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162 +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8* +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4 // CHECK: ret void poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); } - - diff --git a/test/CodeGen/asan-globals.cpp b/test/CodeGen/asan-globals.cpp index 20c1fa702a86..93abb0023cfa 100644 --- a/test/CodeGen/asan-globals.cpp +++ b/test/CodeGen/asan-globals.cpp @@ -1,12 +1,13 @@ // RUN: echo "int extra_global;" > %t.extra-source.cpp // RUN: echo "global:*blacklisted_global*" > %t.blacklist // RUN: %clang_cc1 -include %t.extra-source.cpp -fsanitize=address -fsanitize-blacklist=%t.blacklist -emit-llvm -o - %s | FileCheck %s -// RUN: echo "src:%s" > %t.blacklist-src +// The blacklist file uses regexps, so Windows path 
+// The blacklist file uses regexps, so escape Windows path backslashes.
+// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t.blacklist-src
 // RUN: %clang_cc1 -include %t.extra-source.cpp -fsanitize=address -fsanitize-blacklist=%t.blacklist-src -emit-llvm -o - %s | FileCheck %s --check-prefix=BLACKLIST-SRC
-// REQUIRES: shell
 int global;
 int dyn_init_global = global;
+int __attribute__((no_sanitize("address"))) attributed_global;
 int blacklisted_global;
 void func() {
@@ -14,24 +15,26 @@ void func() {
   const char *literal = "Hello, world!";
 }
-// CHECK: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[BLACKLISTED_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]}
+// CHECK: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[BLACKLISTED_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]}
 // CHECK: ![[EXTRA_GLOBAL]] = !{{{.*}} ![[EXTRA_GLOBAL_LOC:[0-9]+]], !"extra_global", i1 false, i1 false}
 // CHECK: ![[EXTRA_GLOBAL_LOC]] = !{!"{{.*}}extra-source.cpp", i32 1, i32 5}
 // CHECK: ![[GLOBAL]] = !{{{.*}} ![[GLOBAL_LOC:[0-9]+]], !"global", i1 false, i1 false}
 // CHECK: ![[GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 8, i32 5}
 // CHECK: ![[DYN_INIT_GLOBAL]] = !{{{.*}} ![[DYN_INIT_LOC:[0-9]+]], !"dyn_init_global", i1 true, i1 false}
 // CHECK: ![[DYN_INIT_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 9, i32 5}
+// CHECK: ![[ATTR_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // CHECK: ![[BLACKLISTED_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // CHECK: ![[STATIC_VAR]] = !{{{.*}} ![[STATIC_LOC:[0-9]+]], !"static_var", i1 false, i1 false}
-// CHECK: ![[STATIC_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 13, i32 14}
+// CHECK: ![[STATIC_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 14, i32 14}
 // CHECK: ![[LITERAL]] = !{{{.*}} ![[LITERAL_LOC:[0-9]+]], !"<string literal>", i1 false, i1 false}
-// CHECK: ![[LITERAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 14, i32 25}
+// CHECK: ![[LITERAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 15, i32 25}
-// BLACKLIST-SRC: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[BLACKLISTED_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]}
+// BLACKLIST-SRC: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[BLACKLISTED_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]}
 // BLACKLIST-SRC: ![[EXTRA_GLOBAL]] = !{{{.*}} ![[EXTRA_GLOBAL_LOC:[0-9]+]], !"extra_global", i1 false, i1 false}
 // BLACKLIST-SRC: ![[EXTRA_GLOBAL_LOC]] = !{!"{{.*}}extra-source.cpp", i32 1, i32 5}
 // BLACKLIST-SRC: ![[GLOBAL]] = !{{{.*}} null, null, i1 false, i1 true}
 // BLACKLIST-SRC: ![[DYN_INIT_GLOBAL]] = !{{{.*}} null, null, i1 true, i1 true}
+// BLACKLIST-SRC: ![[ATTR_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // BLACKLIST-SRC: ![[BLACKLISTED_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // BLACKLIST-SRC: ![[STATIC_VAR]] = !{{{.*}} null, null, i1 false, i1 true}
 // BLACKLIST-SRC: ![[LITERAL]] = !{{{.*}} null, null, i1 false, i1 true}
diff --git a/test/CodeGen/atomics-inlining.c b/test/CodeGen/atomics-inlining.c
index 4974f22e3a47..fc23c0b3f64f 100644
--- a/test/CodeGen/atomics-inlining.c
+++ b/test/CodeGen/atomics-inlining.c
@@ -3,7 +3,8 @@
 // RUN: %clang_cc1 -triple powerpc64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=PPC64
 // RUN: %clang_cc1 -triple mipsel-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS32
-triple mips64el-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64 -// RUN: %clang_cc1 -triple sparc-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARC +// RUN: %clang_cc1 -triple sparc-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARCV8 -check-prefix=SPARC +// RUN: %clang_cc1 -triple sparcv9-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARCV9 -check-prefix=SPARC unsigned char c1, c2; unsigned short s1, s2; @@ -99,8 +100,10 @@ void test1(void) { // SPARC: store atomic i16 {{.*}}, i16* @s1 seq_cst // SPARC: = load atomic i32, i32* @i1 seq_cst // SPARC: store atomic i32 {{.*}}, i32* @i1 seq_cst -// SPARC: = load atomic i64, i64* @ll1 seq_cst -// SPARC: store atomic i64 {{.*}}, i64* @ll1 seq_cst -// SPARC: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0) -// SPARC: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0) +// SPARCV8: call i64 @__atomic_load_8(i8* bitcast (i64* @ll1 to i8*) +// SPARCV8: call void @__atomic_store_8(i8* bitcast (i64* @ll1 to i8*), i64 +// SPARCV9: load atomic i64, i64* @ll1 seq_cst, align 8 +// SPARCV9: store atomic i64 {{.*}}, i64* @ll1 seq_cst, align 8 +// SPARCV8: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0) +// SPARCV8: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0) } diff --git a/test/CodeGen/attr-minsize.cpp b/test/CodeGen/attr-minsize.cpp index 1e5c634dbdfc..4a26e856ff9e 100644 --- a/test/CodeGen/attr-minsize.cpp +++ b/test/CodeGen/attr-minsize.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -Oz -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=Oz -// RUN: %clang_cc1 -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER -// RUN: %clang_cc1 -O1 -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER -// RUN: %clang_cc1 -O2 -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER -// RUN: %clang_cc1 -O3 -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER -// RUN: %clang_cc1 -Os -disable-llvm-optzns -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER +// RUN: %clang_cc1 -Oz -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=Oz +// RUN: %clang_cc1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER +// RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER +// RUN: %clang_cc1 -O2 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER +// RUN: %clang_cc1 -O3 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER +// RUN: %clang_cc1 -Os -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s -check-prefix=OTHER // Check that we set the minsize attribute on each function // when Oz optimization level is set. 
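As a quick illustration of the behavior this test pins down (a sketch under the test's own flags, not part of the file): any function compiled at -Oz should carry the minsize attribute in the emitted IR, which is easy to confirm by hand.

/* sketch.c -- verify with: clang -Oz -S -emit-llvm sketch.c -o -
   The output IR is expected to contain, roughly:
     define void @f() #0 { ... }
     attributes #0 = { minsize ... }                              */
void f(void) {}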
@@ -76,4 +76,4 @@ void test5<float>(float arg); // Oz: attributes [[MINSIZE]] = { minsize{{.*}} } -// OTHER: attributes [[MS]] = { minsize nounwind{{.*}} } +// OTHER: attributes [[MS]] = { minsize{{.*}} } diff --git a/test/CodeGen/attr-mrecip.c b/test/CodeGen/attr-mrecip.c new file mode 100644 index 000000000000..954c57ae102b --- /dev/null +++ b/test/CodeGen/attr-mrecip.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -mrecip=!sqrtf,vec-divf:3 -emit-llvm %s -o - | FileCheck %s + +int baz(int a) { return 4; } + +// CHECK: baz{{.*}} #0 +// CHECK: #0 = {{.*}}"reciprocal-estimates"="!sqrtf,vec-divf:3" + diff --git a/test/CodeGen/attributes.c b/test/CodeGen/attributes.c index 7bfc3924fffc..9e5fa9f90651 100644 --- a/test/CodeGen/attributes.c +++ b/test/CodeGen/attributes.c @@ -90,5 +90,5 @@ void __attribute__((section(".bar"))) t22(void) {} // CHECK: define void @t22() [[NUW]] section ".bar" -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } -// CHECK: attributes [[NR]] = { noreturn nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } +// CHECK: attributes [[NR]] = { noinline noreturn nounwind{{.*}} } diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c index 650e4d280ecb..6a9f7c050b32 100644 --- a/test/CodeGen/avx-builtins.c +++ b/test/CodeGen/avx-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> @@ -84,13 +82,15 @@ __m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) { __m256d test_mm256_broadcast_pd(__m128d* A) { // CHECK-LABEL: test_mm256_broadcast_pd - // CHECK: call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %{{.*}}) + // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}} + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1> return _mm256_broadcast_pd(A); } __m256 test_mm256_broadcast_ps(__m128* A) { // CHECK-LABEL: test_mm256_broadcast_ps - // CHECK: call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %{{.*}}) + // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}} + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> return _mm256_broadcast_ps(A); } diff --git a/test/CodeGen/avx-cmp-builtins.c b/test/CodeGen/avx-cmp-builtins.c index 30d1bd5c7aa0..7d619426653d 100644 --- a/test/CodeGen/avx-cmp-builtins.c +++ b/test/CodeGen/avx-cmp-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s // FIXME: The shufflevector instructions in test_cmpgt_sd are relying on O3 here. -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/avx-shuffle-builtins.c b/test/CodeGen/avx-shuffle-builtins.c index 22bee33080a9..b5bac89bd1ec 100644 --- a/test/CodeGen/avx-shuffle-builtins.c +++ b/test/CodeGen/avx-shuffle-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s // FIXME: This is testing optimized generation of shuffle instructions and should be fixed. -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> @@ -69,9 +67,7 @@ __m128 test_mm_broadcast_ss(float const *__a) { // CHECK-LABEL: @test_mm_broadcast_ss // CHECK: insertelement <4 x float> {{.*}}, i32 0 - // CHECK: insertelement <4 x float> {{.*}}, i32 1 - // CHECK: insertelement <4 x float> {{.*}}, i32 2 - // CHECK: insertelement <4 x float> {{.*}}, i32 3 + // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer return _mm_broadcast_ss(__a); } @@ -79,9 +75,7 @@ __m256d test_mm256_broadcast_sd(double const *__a) { // CHECK-LABEL: @test_mm256_broadcast_sd // CHECK: insertelement <4 x double> {{.*}}, i32 0 - // CHECK: insertelement <4 x double> {{.*}}, i32 1 - // CHECK: insertelement <4 x double> {{.*}}, i32 2 - // CHECK: insertelement <4 x double> {{.*}}, i32 3 + // CHECK: shufflevector <4 x double> {{.*}}, <4 x double> undef, <4 x i32> zeroinitializer return _mm256_broadcast_sd(__a); } @@ -89,13 +83,7 @@ __m256 test_mm256_broadcast_ss(float const *__a) { // CHECK-LABEL: @test_mm256_broadcast_ss // CHECK: insertelement <8 x float> {{.*}}, i32 0 - // CHECK: insertelement <8 x float> {{.*}}, i32 1 - // CHECK: insertelement <8 x float> {{.*}}, i32 2 - // CHECK: insertelement <8 x float> {{.*}}, i32 3 - // CHECK: insertelement <8 x float> {{.*}}, i32 4 - // CHECK: insertelement <8 x float> {{.*}}, i32 5 - // CHECK: insertelement <8 x float> {{.*}}, i32 6 - // CHECK: insertelement <8 x float> {{.*}}, i32 7 + // CHECK: shufflevector <8 x float> {{.*}}, <8 x float> undef, <8 x i32> zeroinitializer return _mm256_broadcast_ss(__a); } diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index 1e7124537331..4ccf6e6da97b 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/avx512-inline-asm-kregisters-basics.c b/test/CodeGen/avx512-inline-asm-kregisters-basics.c new file mode 100644 index 000000000000..8c0cb9f53456 --- /dev/null +++ b/test/CodeGen/avx512-inline-asm-kregisters-basics.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// This test checks basic inline assembly recognition of k0-k7 registers for avx512. 
+ +void test_basic_inline_asm_with_k_regs() { + //CHECK: kandw %k1, %k2, %k3 + asm("kandw %k1, %k2, %k3\t"); + //CHECK: kandw %k4, %k5, %k6 + asm("kandw %k4, %k5, %k6\t"); + //CHECK: kandw %k7, %k0, %k1 + asm("kandw %k7, %k0, %k1\t"); +}
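Before the next file, a hedged sketch of the constraint-based alternative it exercises: instead of hard-coding k0-k7 in the asm template as above, the "k" and "Yk" constraints let the register allocator choose a mask register. The helper name kand16 is illustrative only; build with the same skylake-avx512 (or -mavx512f) target the tests use.

/* Sketch: AND two 16-bit masks with kandw; "=k"/"k" request AVX-512
   mask registers, mirroring the k_wise_op_* tests that follow.     */
static unsigned short kand16(unsigned short a, unsigned short b) {
  unsigned short r;
  __asm__("kandw %2, %1, %0" : "=k"(r) : "k"(a), "k"(b));
  return r;
}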
\ No newline at end of file diff --git a/test/CodeGen/avx512-kconstraints-att_inline_asm.c b/test/CodeGen/avx512-kconstraints-att_inline_asm.c new file mode 100644 index 000000000000..df93ddf7bbb9 --- /dev/null +++ b/test/CodeGen/avx512-kconstraints-att_inline_asm.c @@ -0,0 +1,59 @@ +// RUN: %clang_cc1 %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror |opt -instnamer -S |FileCheck %s +// This test checks validity of att/gcc style inline assembly for avx512 k and Yk constraints. +// Also checks that mask registers allow flexible types (size <= 64 bit) + +void mask_Yk_i8(char msk){ +//CHECK: vpaddb\09 %xmm1, %xmm0, %xmm1 {$0}\09 + asm ("vpaddb\t %%xmm1, %%xmm0, %%xmm1 %{%0%}\t" + : //output + : "Yk" (msk)); //inputs +} + +void mask_Yk_i16(short msk){ +//CHECK: vpaddb\09 %xmm1, %xmm0, %xmm1 {$0}\09 + asm ("vpaddb\t %%xmm1, %%xmm0, %%xmm1 %{%0%}\t" + : //output + : "Yk" (msk)); //inputs +} + +void mask_Yk_i32(int msk){ +//CHECK: vpaddb\09 %xmm1, %xmm0, %xmm1 {$0}\09 + asm ("vpaddb\t %%xmm1, %%xmm0, %%xmm1 %{%0%}\t" + : //output + : "Yk" (msk)); //inputs +} + +void mask_Yk_i64(long long msk){ +//CHECK: vpaddb\09 %xmm1, %xmm0, %xmm1 {$0}\09 + asm ("vpaddb\t %%xmm1, %%xmm0, %%xmm1 %{%0%}\t" + : //output + : "Yk" (msk)); //inputs +} + +void k_wise_op_i8(char msk_dst,char msk_src1,char msk_src2){ +//CHECK: kandw\09$2, $1, $0 + asm ("kandw\t%2, %1, %0" + : "=k" (msk_dst) + : "k" (msk_src1), "k" (msk_src2)); +} + +void k_wise_op_i16(short msk_dst, short msk_src1, short msk_src2){ +//CHECK: kandw\09$2, $1, $0 + asm ("kandw\t%2, %1, %0" + : "=k" (msk_dst) + : "k" (msk_src1), "k" (msk_src2)); +} + +void k_wise_op_i32(int msk_dst, int msk_src1, int msk_src2){ +//CHECK: kandw\09$2, $1, $0 + asm ("kandw\t%2, %1, %0" + : "=k" (msk_dst) + : "k" (msk_src1), "k" (msk_src2)); +} + +void k_wise_op_i64(long long msk_dst, long long msk_src1, long long msk_src2){ +//CHECK: kandw\09$2, $1, $0 + asm ("kandw\t%2, %1, %0" + : "=k" (msk_dst) + : "k" (msk_src1), "k" (msk_src2)); +} diff --git a/test/CodeGen/avx512-reduceIntrin.c b/test/CodeGen/avx512-reduceIntrin.c new file mode 100644 index 000000000000..d24cd0e5634d --- /dev/null +++ b/test/CodeGen/avx512-reduceIntrin.c @@ -0,0 +1,410 @@ +// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include <immintrin.h> + +long long test_mm512_reduce_add_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i + // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_reduce_add_epi64(__W); +} + +long long test_mm512_reduce_mul_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4,
i32 5, i32 6, i32 7> + // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_reduce_mul_epi64(__W); +} + +long long test_mm512_reduce_or_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i + // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_reduce_or_epi64(__W); +} + +long long test_mm512_reduce_and_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i + // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_reduce_and_epi64(__W); +} + +long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i + // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_add_epi64(__M, __W); +} + +long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i 
__W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_mul_epi64(__M, __W); +} + +long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i + // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 + return _mm512_mask_reduce_and_epi64(__M, __W); +} + +long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i + // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_or_epi64(__M, __W); +} + +int test_mm512_reduce_add_epi32(__m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15> + // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i + // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_add_epi32(__W); +} + +int test_mm512_reduce_mul_epi32(__m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i + // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_mul_epi32(__W); +} + +int test_mm512_reduce_or_epi32(__m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i + // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + 
// CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_or_epi32(__W); +} + +int test_mm512_reduce_and_epi32(__m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i + // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_and_epi32(__W); +} + +int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i + // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_add_epi32(__M, __W); +} + +int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 
x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i + // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_mul_epi32(__M, __W); +} + +int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i + // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_and_epi32(__M, __W); +} + +int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){ + // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer + // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, 
i32 7> + // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i + // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_or_epi32(__M, __W); +} + +double test_mm512_reduce_add_pd(__m512d __W){ + // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_add_pd(__W); +} + +double test_mm512_reduce_mul_pd(__m512d __W){ + // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_mul_pd(__W); +} + +float test_mm512_reduce_add_ps(__m512 __W){ + // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 
undef, i32 undef> + // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i + // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_add_ps(__W); +} + +float test_mm512_reduce_mul_ps(__m512 __W){ + // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i + // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_mul_ps(__W); +} + +double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> zeroinitializer + // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0 + // CHECK: ret double %vecext.i + return _mm512_mask_reduce_add_pd(__M, __W); +} + +double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){ + // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> + // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> + // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> + // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> + // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, 
<2 x double> undef, <2 x i32> <i32 1, i32 undef> + // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0 + // CHECK: ret double %vecext.i + return _mm512_mask_reduce_mul_pd(__M, __W); +} + +float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){ + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> zeroinitializer + // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i + // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_add_ps(__M, __W); +} + +float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){ + // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> + // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> + // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i + // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_mul_ps(__M, __W); +} diff --git a/test/CodeGen/avx512-reduceMinMaxIntrin.c b/test/CodeGen/avx512-reduceMinMaxIntrin.c new file mode 100644 index 000000000000..8249b229c8f5 --- /dev/null +++ 
b/test/CodeGen/avx512-reduceMinMaxIntrin.c @@ -0,0 +1,437 @@ +// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror |opt -instnamer -S |FileCheck %s + +#include <immintrin.h> + +long long test_mm512_reduce_max_epi64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epi64(__W); +} + +unsigned long long test_mm512_reduce_max_epu64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epu64(__W); +} + +double test_mm512_reduce_max_pd(__m512d __W){ + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_max_pd(__W); +} + +long long test_mm512_reduce_min_epi64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epi64(__W); +} + +unsigned long long test_mm512_reduce_min_epu64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epu64(__W); +} + +double test_mm512_reduce_min_pd(__m512d __W){ + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <8 x double> 
%tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_min_pd(__W); +} + +long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp6 = icmp sgt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epi64(__M, __W); +} + +unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epu64(__M, __W); +} + +long long test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 
x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000> + // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0 + // CHECK: %conv = fptosi double %vecext.i to i64 + // CHECK: ret i64 %conv + return _mm512_mask_reduce_max_pd(__M, __W); +} + +long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp6 = icmp slt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_min_epi64(__M, __W); +} + +long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = 
shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epu64(__M, __W); +} + +double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000> + // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0 + // CHECK: ret double %vecext.i + return _mm512_mask_reduce_min_pd(__M, __W); +} + +int test_mm512_reduce_max_epi32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = icmp sgt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef> + // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_max_epi32(__W); +} + +unsigned int test_mm512_reduce_max_epu32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = icmp ugt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_max_epu32(__W); +} + +float test_mm512_reduce_max_ps(__m512 __W){ + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 
x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_max_ps(__W); +} + +int test_mm512_reduce_min_epi32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = icmp slt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_min_epi32(__W); +} + +unsigned int test_mm512_reduce_min_epu32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = icmp ult <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 
5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_min_epu32(__W); +} + +float test_mm512_reduce_min_ps(__m512 __W){ + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_min_ps(__W); +} + +int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 
-2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp9 = icmp sgt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_max_epi32(__M, __W); +} + +unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> 
<i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp9 = icmp ugt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_max_epu32(__M, __W); +} + +float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){ + // CHECK: %tmp = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000> + // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_max_ps(__M, __W); +} + +int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> + // CHECK: %shuffle1.i 
= shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp9 = icmp slt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_min_epi32(__M, __W); +} + +unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp9 = icmp ult <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_min_epu32(__M, __W); +} + +float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){ + // CHECK: %tmp = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> + // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_min_ps(__M, __W); +} + diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c index 1cd0a0ccb141..b4dfb5ccb9ec 100644 --- a/test/CodeGen/avx512bw-builtins.c +++ b/test/CodeGen/avx512bw-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 
-ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> @@ -378,13 +376,15 @@ __m512i test_mm512_add_epi8 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_add_epi8 - //CHECK: @llvm.x86.avx512.mask.padd.b.512 + //CHECK: add <64 x i8> %{{.*}}, %{{.*}} + //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_add_epi8(__W, __U, __A, __B); } __m512i test_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_add_epi8 - //CHECK: @llvm.x86.avx512.mask.padd.b.512 + //CHECK: add <64 x i8> %{{.*}}, %{{.*}} + //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_add_epi8(__U, __A, __B); } @@ -396,13 +396,15 @@ __m512i test_mm512_sub_epi8 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_sub_epi8 - //CHECK: @llvm.x86.avx512.mask.psub.b.512 + //CHECK: sub <64 x i8> %{{.*}}, %{{.*}} + //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_sub_epi8(__W, __U, __A, __B); } __m512i test_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_sub_epi8 - //CHECK: @llvm.x86.avx512.mask.psub.b.512 + //CHECK: sub <64 x i8> %{{.*}}, %{{.*}} + //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_sub_epi8(__U, __A, __B); } @@ -414,13 +416,15 @@ __m512i test_mm512_add_epi16 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_add_epi16 - //CHECK: @llvm.x86.avx512.mask.padd.w.512 + //CHECK: add <32 x i16> %{{.*}}, %{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_add_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_add_epi16 - //CHECK: @llvm.x86.avx512.mask.padd.w.512 + //CHECK: add <32 x i16> %{{.*}}, %{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_add_epi16(__U, __A, __B); } @@ -432,13 +436,15 @@ __m512i test_mm512_sub_epi16 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_sub_epi16 - //CHECK: @llvm.x86.avx512.mask.psub.w.512 + //CHECK: sub <32 x i16> %{{.*}}, %{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_sub_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_sub_epi16 - //CHECK: @llvm.x86.avx512.mask.psub.w.512 + //CHECK: sub <32 x i16> %{{.*}}, %{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_sub_epi16(__U, __A, __B); } @@ -450,13 +456,15 @@ __m512i test_mm512_mullo_epi16 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_mullo_epi16 - //CHECK: @llvm.x86.avx512.mask.pmull.w.512 + //CHECK: mul <32 x i16> %{{.*}}, 
%{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mullo_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_mullo_epi16 - //CHECK: @llvm.x86.avx512.mask.pmull.w.512 + //CHECK: mul <32 x i16> %{{.*}}, %{{.*}} + //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_mullo_epi16(__U, __A, __B); } @@ -652,137 +660,179 @@ __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) { } __m512i test_mm512_max_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_max_epi8 - // CHECK: @llvm.x86.avx512.mask.pmaxs.b.512 + // CHECK: [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] return _mm512_max_epi8(__A,__B); } __m512i test_mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epi8 - // CHECK: @llvm.x86.avx512.mask.pmaxs.b.512 + // CHECK: [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_maskz_max_epi8(__M,__A,__B); } __m512i test_mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epi8 - // CHECK: @llvm.x86.avx512.mask.pmaxs.b.512 + // CHECK: [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_mask_max_epi8(__W,__M,__A,__B); } __m512i test_mm512_max_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.512 + // CHECK: [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] return _mm512_max_epi16(__A,__B); } __m512i test_mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.512 + // CHECK: [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_maskz_max_epi16(__M,__A,__B); } __m512i test_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.512 + // CHECK: [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_mask_max_epi16(__W,__M,__A,__B); } __m512i test_mm512_max_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.512 + // CHECK: [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] return _mm512_max_epu8(__A,__B); } __m512i test_mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.512 + // 
CHECK: [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_maskz_max_epu8(__M,__A,__B); } __m512i test_mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.512 + // CHECK: [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_mask_max_epu8(__W,__M,__A,__B); } __m512i test_mm512_max_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.512 + // CHECK: [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] return _mm512_max_epu16(__A,__B); } __m512i test_mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.512 + // CHECK: [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_maskz_max_epu16(__M,__A,__B); } __m512i test_mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.512 + // CHECK: [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_mask_max_epu16(__W,__M,__A,__B); } __m512i test_mm512_min_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.512 + // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] return _mm512_min_epi8(__A,__B); } __m512i test_mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.512 + // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_maskz_min_epi8(__M,__A,__B); } __m512i test_mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.512 + // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_mask_min_epi8(__W,__M,__A,__B); } __m512i test_mm512_min_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.512 + // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] return _mm512_min_epi16(__A,__B); } __m512i 
test_mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.512 + // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_maskz_min_epi16(__M,__A,__B); } __m512i test_mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.512 + // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_mask_min_epi16(__W,__M,__A,__B); } __m512i test_mm512_min_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.512 + // CHECK: [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] return _mm512_min_epu8(__A,__B); } __m512i test_mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.512 + // CHECK: [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_maskz_min_epu8(__M,__A,__B); } __m512i test_mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.512 + // CHECK: [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]] + // CHECK: select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}} return _mm512_mask_min_epu8(__W,__M,__A,__B); } __m512i test_mm512_min_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.512 + // CHECK: [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] return _mm512_min_epu16(__A,__B); } __m512i test_mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.512 + // CHECK: [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_maskz_min_epu16(__M,__A,__B); } __m512i test_mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.512 + // CHECK: [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}} return _mm512_mask_min_epu16(__W,__M,__A,__B); } __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.512 + // CHECK: 
@llvm.x86.avx512.pshuf.b.512 return _mm512_shuffle_epi8(__A,__B); } __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.512 + // CHECK: @llvm.x86.avx512.pshuf.b.512 + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_shuffle_epi8(__W,__U,__A,__B); } __m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.512 + // CHECK: @llvm.x86.avx512.pshuf.b.512 + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_shuffle_epi8(__U,__A,__B); } __m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) { @@ -1079,37 +1129,41 @@ __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) __m512i test_mm512_cvtepi8_epi16(__m256i __A) { // CHECK-LABEL: @test_mm512_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512 + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> return _mm512_cvtepi8_epi16(__A); } __m512i test_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512 + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_cvtepi8_epi16(__W, __U, __A); } __m512i test_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512 + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_cvtepi8_epi16(__U, __A); } __m512i test_mm512_cvtepu8_epi16(__m256i __A) { // CHECK-LABEL: @test_mm512_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512 + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> return _mm512_cvtepu8_epi16(__A); } __m512i test_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512 + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_cvtepu8_epi16(__W, __U, __A); } __m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512 + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_cvtepu8_epi16(__U, __A); } @@ -1155,55 +1209,61 @@ __m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) { __m512i test_mm512_sllv_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.512( return _mm512_sllv_epi16(__A, __B); } __m512i test_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_sllv_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_sllv_epi16 - // CHECK: 
@llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_sllv_epi16(__U, __A, __B); } __m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.512 + // CHECK: @llvm.x86.avx512.psll.w.512 return _mm512_sll_epi16(__A, __B); } __m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.512 + // CHECK: @llvm.x86.avx512.psll.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_sll_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.512 + // CHECK: @llvm.x86.avx512.psll.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_sll_epi16(__U, __A, __B); } __m512i test_mm512_slli_epi16(__m512i __A) { // CHECK-LABEL: @test_mm512_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.512 + // CHECK: @llvm.x86.avx512.pslli.w.512 return _mm512_slli_epi16(__A, 5); } __m512i test_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.512 + // CHECK: @llvm.x86.avx512.pslli.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_slli_epi16(__W, __U, __A, 5); } __m512i test_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.512 + // CHECK: @llvm.x86.avx512.pslli.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_slli_epi16(__U, __A, 5); } @@ -1215,109 +1275,121 @@ __m512i test_mm512_bslli_epi128(__m512i __A) { __m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_srlv_epi16 - // CHECK: @llvm.x86.avx512.mask.psrlv + // CHECK: @llvm.x86.avx512.psrlv.w.512( return _mm512_srlv_epi16(__A, __B); } __m512i test_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_srlv_epi16 - // CHECK: @llvm.x86.avx512.mask.psrlv + // CHECK: @llvm.x86.avx512.psrlv.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_srlv_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_srlv_epi16 - // CHECK: @llvm.x86.avx512.mask.psrlv + // CHECK: @llvm.x86.avx512.psrlv.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_srlv_epi16(__U, __A, __B); } __m512i test_mm512_srav_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_srav_epi16 - // CHECK: @llvm.x86.avx512.mask.psrav + // CHECK: @llvm.x86.avx512.psrav.w.512( return _mm512_srav_epi16(__A, __B); } __m512i test_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_srav_epi16 - // CHECK: @llvm.x86.avx512.mask.psrav + // CHECK: @llvm.x86.avx512.psrav.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_srav_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, 
__m512i __B) { // CHECK-LABEL: @test_mm512_maskz_srav_epi16 - // CHECK: @llvm.x86.avx512.mask.psrav + // CHECK: @llvm.x86.avx512.psrav.w.512( + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_srav_epi16(__U, __A, __B); } __m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sra_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.w.512 + // CHECK: @llvm.x86.avx512.psra.w.512 return _mm512_sra_epi16(__A, __B); } __m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sra_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.w.512 + // CHECK: @llvm.x86.avx512.psra.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_sra_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sra_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.w.512 + // CHECK: @llvm.x86.avx512.psra.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_sra_epi16(__U, __A, __B); } __m512i test_mm512_srai_epi16(__m512i __A) { // CHECK-LABEL: @test_mm512_srai_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.wi.512 + // CHECK: @llvm.x86.avx512.psrai.w.512 return _mm512_srai_epi16(__A, 5); } __m512i test_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srai_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.wi.512 + // CHECK: @llvm.x86.avx512.psrai.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_srai_epi16(__W, __U, __A, 5); } __m512i test_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_srai_epi16 - // CHECK: @llvm.x86.avx512.mask.psra.wi.512 + // CHECK: @llvm.x86.avx512.psrai.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_srai_epi16(__U, __A, 5); } __m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_srl_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.w.512 + // CHECK: @llvm.x86.avx512.psrl.w.512 return _mm512_srl_epi16(__A, __B); } __m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_srl_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.w.512 + // CHECK: @llvm.x86.avx512.psrl.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_srl_epi16(__W, __U, __A, __B); } __m512i test_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_srl_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.w.512 + // CHECK: @llvm.x86.avx512.psrl.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_srl_epi16(__U, __A, __B); } __m512i test_mm512_srli_epi16(__m512i __A) { // CHECK-LABEL: @test_mm512_srli_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.wi.512 + // CHECK: @llvm.x86.avx512.psrli.w.512 return _mm512_srli_epi16(__A, 5); } __m512i test_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srli_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.wi.512 + // CHECK: @llvm.x86.avx512.psrli.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_srli_epi16(__W, __U, __A, 5); } __m512i test_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: 
@test_mm512_maskz_srli_epi16 - // CHECK: @llvm.x86.avx512.mask.psrl.wi.512 + // CHECK: @llvm.x86.avx512.psrli.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_srli_epi16(__U, __A, 5); }
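The recurring pattern in the hunks above and below: masked integer intrinsics no longer expect a monolithic @llvm.x86.avx512.mask.* call, but the generic vector operation followed by a select on the bitcast mask, which leaves ordinary IR for the optimizer to work with. In C terms the contract being checked is equivalent to this sketch; the helper name is hypothetical, and the real header composes the same two steps out of builtins.

#include <immintrin.h>
/* build with AVX-512BW enabled, e.g. -mavx512bw */
static __m512i mask_add_epi8_model(__m512i src, __mmask64 k, __m512i a, __m512i b) {
  __m512i sum = _mm512_add_epi8(a, b);      /* matches: add <64 x i8> %{{.*}}, %{{.*}} */
  return _mm512_mask_mov_epi8(src, k, sum); /* matches: select <64 x i1> on the bitcast mask */
}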
diff --git a/test/CodeGen/avx512cdintrin.c b/test/CodeGen/avx512cdintrin.c index 415a82c2c191..a28601895be1 100644 --- a/test/CodeGen/avx512cdintrin.c +++ b/test/CodeGen/avx512cdintrin.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512cd -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512cd -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c index 59a7cad7e3d1..f57433a3616b 100644 --- a/test/CodeGen/avx512dq-builtins.c +++ b/test/CodeGen/avx512dq-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> @@ -13,13 +11,15 @@ __m512i test_mm512_mullo_epi64 (__m512i __A, __m512i __B) { __m512i test_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.512 + // CHECK: mul <8 x i64> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return (__m512i) _mm512_mask_mullo_epi64(__W, __U, __A, __B); } __m512i test_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.512 + // CHECK: mul <8 x i64> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return (__m512i) _mm512_maskz_mullo_epi64(__U, __A, __B); } @@ -31,13 +31,17 @@ __m512d test_mm512_xor_pd (__m512d __A, __m512d __B) { __m512d test_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_xor_pd - // CHECK: @llvm.x86.avx512.mask.xor.pd.512 + // CHECK: xor <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_xor_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_xor_pd - // CHECK: @llvm.x86.avx512.mask.xor.pd.512 + // CHECK: xor <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_xor_pd(__U, __A, __B); } @@ -49,13 +53,17 @@ __m512 test_mm512_xor_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_xor_ps - // CHECK: @llvm.x86.avx512.mask.xor.ps.512 + // CHECK: xor <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_xor_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_xor_ps - // CHECK: @llvm.x86.avx512.mask.xor.ps.512 + // CHECK: xor <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_xor_ps(__U, __A, __B); } @@ -67,13 +75,17 @@ __m512d test_mm512_or_pd (__m512d __A, __m512d __B) { __m512d test_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_or_pd - // CHECK: @llvm.x86.avx512.mask.or.pd.512 + // CHECK: or <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_or_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_or_pd - // CHECK: @llvm.x86.avx512.mask.or.pd.512 + // CHECK: or <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_or_pd(__U, __A, __B); } @@ -85,13 +97,17 @@ __m512 test_mm512_or_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_or_ps - // CHECK: @llvm.x86.avx512.mask.or.ps.512 + // CHECK: or <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_or_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_or_ps - // CHECK: @llvm.x86.avx512.mask.or.ps.512 + // CHECK: or <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_or_ps(__U, __A, __B); } @@ -103,13 +119,17 @@ __m512d test_mm512_and_pd (__m512d __A, __m512d __B) { __m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_and_pd - // CHECK: @llvm.x86.avx512.mask.and.pd.512 + // CHECK: and <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_and_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_and_pd - // CHECK: @llvm.x86.avx512.mask.and.pd.512 + // CHECK: and <8 x i64> + // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_and_pd(__U, __A, __B); } @@ -121,49 +141,63 @@ __m512 test_mm512_and_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_and_ps - // CHECK: @llvm.x86.avx512.mask.and.ps.512 + // CHECK: and <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_and_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_and_ps - // CHECK: @llvm.x86.avx512.mask.and.ps.512 + // CHECK: and <16 x i32> + // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to
<16 x i1> + // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_and_ps(__U, __A, __B); } __m512d test_mm512_andnot_pd (__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.512 + // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: and <8 x i64> return (__m512d) _mm512_andnot_pd(__A, __B); } __m512d test_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.512 + // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: and <8 x i64> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_andnot_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.512 + // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: and <8 x i64> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_andnot_pd(__U, __A, __B); } __m512 test_mm512_andnot_ps (__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_andnot_ps - // CHECK: @llvm.x86.avx512.mask.andn.ps.512 + // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + // CHECK: and <16 x i32> return (__m512) _mm512_andnot_ps(__A, __B); } __m512 test_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_andnot_ps - // CHECK: @llvm.x86.avx512.mask.andn.ps.512 + // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + // CHECK: and <16 x i32> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_andnot_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_andnot_ps - // CHECK: @llvm.x86.avx512.mask.andn.ps.512 + // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + // CHECK: and <16 x i32> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_andnot_ps(__U, __A, __B); } @@ -188,19 +222,19 @@ __m512i test_mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) { __m512i test_mm512_cvt_roundpd_epi64(__m512d __A) { // CHECK-LABEL: @test_mm512_cvt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512 - return _mm512_cvt_roundpd_epi64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundpd_epi64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512 - return _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i 
test_mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512 - return _mm512_maskz_cvt_roundpd_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundpd_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_cvtpd_epu64(__m512d __A) { @@ -224,19 +258,19 @@ __m512i test_mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) { __m512i test_mm512_cvt_roundpd_epu64(__m512d __A) { // CHECK-LABEL: @test_mm512_cvt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512 - return _mm512_cvt_roundpd_epu64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundpd_epu64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512 - return _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512 - return _mm512_maskz_cvt_roundpd_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundpd_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_cvtps_epi64(__m256 __A) { @@ -260,19 +294,19 @@ __m512i test_mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) { __m512i test_mm512_cvt_roundps_epi64(__m256 __A) { // CHECK-LABEL: @test_mm512_cvt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512 - return _mm512_cvt_roundps_epi64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundps_epi64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512 - return _mm512_mask_cvt_roundps_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundps_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512 - return _mm512_maskz_cvt_roundps_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundps_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_cvtps_epu64(__m256 __A) { @@ -296,19 +330,19 @@ __m512i test_mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) { __m512i test_mm512_cvt_roundps_epu64(__m256 __A) { // CHECK-LABEL: @test_mm512_cvt_roundps_epu64 // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512 - return _mm512_cvt_roundps_epu64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundps_epu64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epu64 // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512 - return _mm512_mask_cvt_roundps_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundps_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epu64 // 
CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512 - return _mm512_maskz_cvt_roundps_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundps_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_cvtepi64_pd(__m512i __A) { @@ -332,19 +366,19 @@ __m512d test_mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) { __m512d test_mm512_cvt_roundepi64_pd(__m512i __A) { // CHECK-LABEL: @test_mm512_cvt_roundepi64_pd // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512 - return _mm512_cvt_roundepi64_pd(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundepi64_pd(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundepi64_pd // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512 - return _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi64_pd // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512 - return _mm512_maskz_cvt_roundepi64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundepi64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_cvtepi64_ps(__m512i __A) { @@ -368,19 +402,19 @@ __m256 test_mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) { __m256 test_mm512_cvt_roundepi64_ps(__m512i __A) { // CHECK-LABEL: @test_mm512_cvt_roundepi64_ps // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.512 - return _mm512_cvt_roundepi64_ps(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundepi64_ps(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundepi64_ps // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.512 - return _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi64_ps // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.512 - return _mm512_maskz_cvt_roundepi64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundepi64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512i test_mm512_cvttpd_epi64(__m512d __A) { @@ -404,19 +438,19 @@ __m512i test_mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) { __m512i test_mm512_cvtt_roundpd_epi64(__m512d __A) { // CHECK-LABEL: @test_mm512_cvtt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512 - return _mm512_cvtt_roundpd_epi64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvtt_roundpd_epi64(__A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512 - return _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epi64 // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512 - return _mm512_maskz_cvtt_roundpd_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return 
_mm512_maskz_cvtt_roundpd_epi64(__U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_cvttpd_epu64(__m512d __A) { @@ -440,19 +474,19 @@ __m512i test_mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d __A) { __m512i test_mm512_cvtt_roundpd_epu64(__m512d __A) { // CHECK-LABEL: @test_mm512_cvtt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512 - return _mm512_cvtt_roundpd_epu64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvtt_roundpd_epu64(__A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_mask_cvtt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512 - return _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epu64 // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512 - return _mm512_maskz_cvtt_roundpd_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvtt_roundpd_epu64(__U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_cvttps_epi64(__m256 __A) { @@ -476,19 +510,19 @@ __m512i test_mm512_maskz_cvttps_epi64(__mmask8 __U, __m256 __A) { __m512i test_mm512_cvtt_roundps_epi64(__m256 __A) { // CHECK-LABEL: @test_mm512_cvtt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512 - return _mm512_cvtt_roundps_epi64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvtt_roundps_epi64(__A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512 - return _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epi64 // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512 - return _mm512_maskz_cvtt_roundps_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvtt_roundps_epi64(__U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_cvttps_epu64(__m256 __A) { @@ -512,19 +546,19 @@ __m512i test_mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) { __m512i test_mm512_cvtt_roundps_epu64(__m256 __A) { // CHECK-LABEL: @test_mm512_cvtt_roundps_epu64 // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512 - return _mm512_cvtt_roundps_epu64(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvtt_roundps_epu64(__A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epu64 // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512 - return _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); } __m512i test_mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epu64 // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512 - return _mm512_maskz_cvtt_roundps_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvtt_roundps_epu64(__U, __A, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_cvtepu64_pd(__m512i __A) { @@ -548,19 +582,19 @@ __m512d test_mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) { __m512d test_mm512_cvt_roundepu64_pd(__m512i __A) { 
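  // The rounding immediates in these tests differ on purpose. The truncating
  // cvtt_round* forms always chop toward zero, so the only sensible immediate
  // is the pass-through _MM_FROUND_CUR_DIRECTION, while the cvt_round* forms
  // select an explicit mode and OR in _MM_FROUND_NO_EXC. A sketch, assuming
  // the usual <smmintrin.h> macro values (_MM_FROUND_TO_NEAREST_INT 0x00,
  // _MM_FROUND_CUR_DIRECTION 0x04, _MM_FROUND_NO_EXC 0x08):
  //   _mm512_cvt_roundpd_epi64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // imm = 0x08
  //   _mm512_cvtt_roundpd_epi64(__A, _MM_FROUND_CUR_DIRECTION);                     // imm = 0x04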
// CHECK-LABEL: @test_mm512_cvt_roundepu64_pd // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512 - return _mm512_cvt_roundepu64_pd(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundepu64_pd(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundepu64_pd // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512 - return _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu64_pd // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512 - return _mm512_maskz_cvt_roundepu64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundepu64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_cvtepu64_ps(__m512i __A) { @@ -584,19 +618,19 @@ __m256 test_mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) { __m256 test_mm512_cvt_roundepu64_ps(__m512i __A) { // CHECK-LABEL: @test_mm512_cvt_roundepu64_ps // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.512 - return _mm512_cvt_roundepu64_ps(__A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_cvt_roundepu64_ps(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_cvt_roundepu64_ps // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.512 - return _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m256 test_mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu64_ps // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.512 - return _mm512_maskz_cvt_roundepu64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_cvt_roundepu64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_range_pd(__m512d __A, __m512d __B) { @@ -1020,145 +1054,161 @@ __m512i test_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { } __m256 test_mm512_extractf32x8_ps(__m512 __A) { // CHECK-LABEL: @test_mm512_extractf32x8_ps - // CHECK: @llvm.x86.avx512.mask.vextractf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm512_extractf32x8_ps(__A, 1); } __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK-LABEL: @test_mm512_mask_extractf32x8_ps - // CHECK: @llvm.x86.avx512.mask.vextractf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); } __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { // CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps - // CHECK: @llvm.x86.avx512.mask.vextractf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm512_maskz_extractf32x8_ps(__U, __A, 1); } __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK-LABEL: 
@test_mm512_extractf64x2_pd - // CHECK: @llvm.x86.avx512.mask.vextractf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7> return _mm512_extractf64x2_pd(__A, 3); } __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_extractf64x2_pd - // CHECK: @llvm.x86.avx512.mask.vextractf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); } __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd - // CHECK: @llvm.x86.avx512.mask.vextractf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_maskz_extractf64x2_pd(__U, __A, 3); } __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK-LABEL: @test_mm512_extracti32x8_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm512_extracti32x8_epi32(__A, 1); } __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); } __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK-LABEL: @test_mm512_extracti64x2_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7> return _mm512_extracti64x2_epi64(__A, 3); } __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); } __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); } __m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) { // CHECK-LABEL: @test_mm512_insertf32x8 - // CHECK: @llvm.x86.avx512.mask.insertf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, 
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> return _mm512_insertf32x8(__A, __B, 1); } __m512 test_mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B) { // CHECK-LABEL: @test_mm512_mask_insertf32x8 - // CHECK: @llvm.x86.avx512.mask.insertf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_insertf32x8(__W, __U, __A, __B, 1); } __m512 test_mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B) { // CHECK-LABEL: @test_mm512_maskz_insertf32x8 - // CHECK: @llvm.x86.avx512.mask.insertf32x8 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_insertf32x8(__U, __A, __B, 1); } __m512d test_mm512_insertf64x2(__m512d __A, __m128d __B) { // CHECK-LABEL: @test_mm512_insertf64x2 - // CHECK: @llvm.x86.avx512.mask.insertf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> return _mm512_insertf64x2(__A, __B, 3); } __m512d test_mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B) { // CHECK-LABEL: @test_mm512_mask_insertf64x2 - // CHECK: @llvm.x86.avx512.mask.insertf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_insertf64x2(__W, __U, __A, __B, 3); } __m512d test_mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B) { // CHECK-LABEL: @test_mm512_maskz_insertf64x2 - // CHECK: @llvm.x86.avx512.mask.insertf64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_insertf64x2(__U, __A, __B, 3); } __m512i test_mm512_inserti32x8(__m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_inserti32x8 - // CHECK: @llvm.x86.avx512.mask.inserti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> return _mm512_inserti32x8(__A, __B, 1); } __m512i test_mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_mask_inserti32x8 - // CHECK: @llvm.x86.avx512.mask.inserti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_inserti32x8(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_maskz_inserti32x8 - // CHECK: @llvm.x86.avx512.mask.inserti32x8 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_inserti32x8(__U, __A, __B, 1); } __m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_inserti64x2 - // CHECK: @llvm.x86.avx512.mask.inserti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> return _mm512_inserti64x2(__A, __B, 1); } __m512i test_mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_inserti64x2 - // CHECK: @llvm.x86.avx512.mask.inserti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_inserti64x2(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_inserti64x2 - // CHECK: @llvm.x86.avx512.mask.inserti64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_inserti64x2(__U, __A, __B, 1); } __mmask8 test_mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A) { diff --git a/test/CodeGen/avx512er-builtins.c b/test/CodeGen/avx512er-builtins.c index 7c6b050c8ae5..084e0f53825c 100644 --- a/test/CodeGen/avx512er-builtins.c +++ b/test/CodeGen/avx512er-builtins.c @@ -1,26 +1,24 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
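// Note on the RUN-line change above: with -ffreestanding, the intrinsics
// headers should no longer reach mm_malloc.h (and, through it, the host's
// <stdlib.h>), so the manual include-guard workaround deleted below becomes
// redundant. A sketch of that retired workaround, assuming __MM_MALLOC_H is
// the header's include guard:
//   #define __MM_MALLOC_H   // pre-define the guard so mm_malloc.h expands to nothing
//   #include <immintrin.h>  // now safe without system headers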
-#define __MM_MALLOC_H #include <immintrin.h> __m512d test_mm512_rsqrt28_round_pd(__m512d a) { // CHECK-LABEL: @test_mm512_rsqrt28_round_pd // CHECK: @llvm.x86.avx512.rsqrt28.pd - return _mm512_rsqrt28_round_pd(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_rsqrt28_round_pd(a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_mask_rsqrt28_round_pd(__m512d s, __mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_pd // CHECK: @llvm.x86.avx512.rsqrt28.pd - return _mm512_mask_rsqrt28_round_pd(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_rsqrt28_round_pd(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_maskz_rsqrt28_round_pd(__mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_pd // CHECK: @llvm.x86.avx512.rsqrt28.pd - return _mm512_maskz_rsqrt28_round_pd(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_rsqrt28_round_pd(m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_rsqrt28_pd(__m512d a) { @@ -44,19 +42,19 @@ __m512d test_mm512_maskz_rsqrt28_pd(__mmask8 m, __m512d a) { __m512 test_mm512_rsqrt28_round_ps(__m512 a) { // CHECK-LABEL: @test_mm512_rsqrt28_round_ps // CHECK: @llvm.x86.avx512.rsqrt28.ps - return _mm512_rsqrt28_round_ps(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_rsqrt28_round_ps(a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_mask_rsqrt28_round_ps(__m512 s, __mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_ps // CHECK: @llvm.x86.avx512.rsqrt28.ps - return _mm512_mask_rsqrt28_round_ps(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_rsqrt28_round_ps(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_maskz_rsqrt28_round_ps(__mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_ps // CHECK: @llvm.x86.avx512.rsqrt28.ps - return _mm512_maskz_rsqrt28_round_ps(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_rsqrt28_round_ps(m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_rsqrt28_ps(__m512 a) { @@ -80,19 +78,19 @@ __m512 test_mm512_maskz_rsqrt28_ps(__mmask16 m, __m512 a) { __m128 test_mm_rsqrt28_round_ss(__m128 a, __m128 b) { // CHECK-LABEL: @test_mm_rsqrt28_round_ss // CHECK: @llvm.x86.avx512.rsqrt28.ss - return _mm_rsqrt28_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_rsqrt28_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_mask_rsqrt28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { // CHECK-LABEL: @test_mm_mask_rsqrt28_round_ss // CHECK: @llvm.x86.avx512.rsqrt28.ss - return _mm_mask_rsqrt28_round_ss(s, m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_mask_rsqrt28_round_ss(s, m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_maskz_rsqrt28_round_ss(__mmask16 m, __m128 a, __m128 b) { // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_ss // CHECK: @llvm.x86.avx512.rsqrt28.ss - return _mm_maskz_rsqrt28_round_ss(m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_rsqrt28_round_ss(m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_rsqrt28_ss(__m128 a, __m128 b) { @@ -116,37 +114,37 @@ __m128 test_mm_maskz_rsqrt28_ss(__mmask16 m, __m128 a, __m128 b) { __m128d test_mm_rsqrt28_round_sd(__m128d a, __m128d b) { // CHECK-LABEL: @test_mm_rsqrt28_round_sd // CHECK: @llvm.x86.avx512.rsqrt28.sd - return _mm_rsqrt28_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_rsqrt28_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_mask_rsqrt28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) { // CHECK-LABEL: @test_mm_mask_rsqrt28_round_sd // CHECK: @llvm.x86.avx512.rsqrt28.sd - return _mm_mask_rsqrt28_round_sd(s, m, a, b, 
_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_rsqrt28_round_sd(s, m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_maskz_rsqrt28_round_sd(__mmask8 m, __m128d a, __m128d b) { // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_sd // CHECK: @llvm.x86.avx512.rsqrt28.sd - return _mm_maskz_rsqrt28_round_sd(m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_rsqrt28_round_sd(m, a, b, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_rcp28_round_pd(__m512d a) { // CHECK-LABEL: @test_mm512_rcp28_round_pd // CHECK: @llvm.x86.avx512.rcp28.pd - return _mm512_rcp28_round_pd(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_rcp28_round_pd(a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_mask_rcp28_round_pd(__m512d s, __mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_mask_rcp28_round_pd // CHECK: @llvm.x86.avx512.rcp28.pd - return _mm512_mask_rcp28_round_pd(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_rcp28_round_pd(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_maskz_rcp28_round_pd(__mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_maskz_rcp28_round_pd // CHECK: @llvm.x86.avx512.rcp28.pd - return _mm512_maskz_rcp28_round_pd(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_rcp28_round_pd(m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_rcp28_pd(__m512d a) { @@ -170,19 +168,19 @@ __m512d test_mm512_maskz_rcp28_pd(__mmask8 m, __m512d a) { __m512 test_mm512_rcp28_round_ps(__m512 a) { // CHECK-LABEL: @test_mm512_rcp28_round_ps // CHECK: @llvm.x86.avx512.rcp28.ps - return _mm512_rcp28_round_ps(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_rcp28_round_ps(a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_mask_rcp28_round_ps(__m512 s, __mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_mask_rcp28_round_ps // CHECK: @llvm.x86.avx512.rcp28.ps - return _mm512_mask_rcp28_round_ps(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_rcp28_round_ps(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_maskz_rcp28_round_ps(__mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_maskz_rcp28_round_ps // CHECK: @llvm.x86.avx512.rcp28.ps - return _mm512_maskz_rcp28_round_ps(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_rcp28_round_ps(m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_rcp28_ps(__m512 a) { @@ -206,19 +204,19 @@ __m512 test_mm512_maskz_rcp28_ps(__mmask16 m, __m512 a) { __m128 test_mm_rcp28_round_ss(__m128 a, __m128 b) { // CHECK-LABEL: @test_mm_rcp28_round_ss // CHECK: @llvm.x86.avx512.rcp28.ss - return _mm_rcp28_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_rcp28_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_mask_rcp28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { // CHECK-LABEL: @test_mm_mask_rcp28_round_ss // CHECK: @llvm.x86.avx512.rcp28.ss - return _mm_mask_rcp28_round_ss(s, m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_mask_rcp28_round_ss(s, m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_maskz_rcp28_round_ss(__mmask16 m, __m128 a, __m128 b) { // CHECK-LABEL: @test_mm_maskz_rcp28_round_ss // CHECK: @llvm.x86.avx512.rcp28.ss - return _mm_maskz_rcp28_round_ss(m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_rcp28_round_ss(m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128 test_mm_rcp28_ss(__m128 a, __m128 b) { @@ -242,19 +240,19 @@ __m128 test_mm_maskz_rcp28_ss(__mmask16 m, __m128 a, __m128 b) { __m128d test_mm_rcp28_round_sd(__m128d a, __m128d b) { // CHECK-LABEL: @test_mm_rcp28_round_sd // CHECK: @llvm.x86.avx512.rcp28.sd - return _mm_rcp28_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT); + return 
_mm_rcp28_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_mask_rcp28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) { // CHECK-LABEL: @test_mm_mask_rcp28_round_sd // CHECK: @llvm.x86.avx512.rcp28.sd - return _mm_mask_rcp28_round_sd(s, m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_mask_rcp28_round_sd(s, m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_maskz_rcp28_round_sd(__mmask8 m, __m128d a, __m128d b) { // CHECK-LABEL: @test_mm_maskz_rcp28_round_sd // CHECK: @llvm.x86.avx512.rcp28.sd - return _mm_maskz_rcp28_round_sd(m, a, b, _MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_rcp28_round_sd(m, a, b, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_rcp28_sd(__m128d a, __m128d b) { @@ -278,19 +276,19 @@ __m128d test_mm_maskz_rcp28_sd(__mmask8 m, __m128d a, __m128d b) { __m512d test_mm512_exp2a23_round_pd(__m512d a) { // CHECK-LABEL: @test_mm512_exp2a23_round_pd // CHECK: @llvm.x86.avx512.exp2.pd - return _mm512_exp2a23_round_pd(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_exp2a23_round_pd(a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_mask_exp2a23_round_pd(__m512d s, __mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_mask_exp2a23_round_pd // CHECK: @llvm.x86.avx512.exp2.pd - return _mm512_mask_exp2a23_round_pd(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_exp2a23_round_pd(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_maskz_exp2a23_round_pd(__mmask8 m, __m512d a) { // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_pd // CHECK: @llvm.x86.avx512.exp2.pd - return _mm512_maskz_exp2a23_round_pd(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_exp2a23_round_pd(m, a, _MM_FROUND_CUR_DIRECTION); } __m512d test_mm512_exp2a23_pd(__m512d a) { @@ -314,19 +312,19 @@ __m512d test_mm512_maskz_exp2a23_pd(__mmask8 m, __m512d a) { __m512 test_mm512_exp2a23_round_ps(__m512 a) { // CHECK-LABEL: @test_mm512_exp2a23_round_ps // CHECK: @llvm.x86.avx512.exp2.ps - return _mm512_exp2a23_round_ps(a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_exp2a23_round_ps(a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_mask_exp2a23_round_ps(__m512 s, __mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_mask_exp2a23_round_ps // CHECK: @llvm.x86.avx512.exp2.ps - return _mm512_mask_exp2a23_round_ps(s, m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_exp2a23_round_ps(s, m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_maskz_exp2a23_round_ps(__mmask16 m, __m512 a) { // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_ps // CHECK: @llvm.x86.avx512.exp2.ps - return _mm512_maskz_exp2a23_round_ps(m, a, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_exp2a23_round_ps(m, a, _MM_FROUND_CUR_DIRECTION); } __m512 test_mm512_exp2a23_ps(__m512 a) { diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c index 556c06f532cf..760783af1ce0 100644 --- a/test/CodeGen/avx512f-builtins.c +++ b/test/CodeGen/avx512f-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Werror | FileCheck %s - -// Don't include mm_malloc.h, it's system specific. 
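// The avx512f test below now runs twice: once unoptimized under the default
// CHECK prefix and once at -O2 under -check-prefix=O2, so one file can pin
// both the raw and the optimized IR. A sketch of how the prefixes split the
// work (hypothetical check lines, not taken from this test):
//   // CHECK: select <16 x i1>   <- seen only by the first RUN line
//   // O2: ret <16 x float>      <- seen only by the -O2 RUN line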
-#define __MM_MALLOC_H +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -O2 -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=O2 #include <immintrin.h> @@ -395,105 +393,109 @@ __mmask16 test_mm512_knot(__mmask16 a) __m512i test_mm512_alignr_epi32(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_alignr_epi32 - // CHECK: @llvm.x86.avx512.mask.valign.d.512 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> return _mm512_alignr_epi32(a, b, 2); } __m512i test_mm512_mask_alignr_epi32(__m512i w, __mmask16 u, __m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_mask_alignr_epi32 - // CHECK: @llvm.x86.avx512.mask.valign.d.512 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}} return _mm512_mask_alignr_epi32(w, u, a, b, 2); } __m512i test_mm512_maskz_alignr_epi32( __mmask16 u, __m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_maskz_alignr_epi32 - // CHECK: @llvm.x86.avx512.mask.valign.d.512 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}} return _mm512_maskz_alignr_epi32(u, a, b, 2); } __m512i test_mm512_alignr_epi64(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_alignr_epi64 - // CHECK: @llvm.x86.avx512.mask.valign.q.512 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> return _mm512_alignr_epi64(a, b, 2); } __m512i test_mm512_mask_alignr_epi64(__m512i w, __mmask8 u, __m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_mask_alignr_epi64 - // CHECK: @llvm.x86.avx512.mask.valign.q.512 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}} return _mm512_mask_alignr_epi64(w, u, a, b, 2); } __m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_maskz_alignr_epi64 - // CHECK: @llvm.x86.avx512.mask.valign.q.512 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}} return _mm512_maskz_alignr_epi64(u, a, b, 2); } __m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_fmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_mask_fmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + 
return _mm512_mask_fmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pd // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 - return _mm512_mask3_fmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 - return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_fmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_mask_fmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 - return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fnmadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_fnmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fnmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_pd // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 - return _mm512_mask3_fnmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fnmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 - return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fnmsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 - return _mm512_fnmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fnmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: 
@test_mm512_maskz_fnmsub_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 - return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmadd_pd @@ -558,62 +560,62 @@ __m512d test_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m51 __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_fmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_mask_fmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ps // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512 - return _mm512_mask3_fmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 - return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_fmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_mask_fmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 - return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_fnmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fnmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ps // CHECK: 
@llvm.x86.avx512.mask3.vfmadd.ps.512 - return _mm512_mask3_fnmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fnmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 - return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 - return _mm512_fnmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fnmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 - return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmadd_ps @@ -678,37 +680,37 @@ __m512 test_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmaddsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 - return _mm512_fmaddsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmaddsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 - return _mm512_mask_fmaddsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmaddsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_pd // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.512 - return _mm512_mask3_fmaddsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmaddsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 - return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsubadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 - return _mm512_fmsubadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmsubadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: 
@test_mm512_mask_fmsubadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 - return _mm512_mask_fmsubadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmsubadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_pd // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 - return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmaddsub_pd @@ -748,37 +750,37 @@ __m512d test_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m __m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmaddsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 - return _mm512_fmaddsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmaddsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 - return _mm512_mask_fmaddsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmaddsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ps // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.512 - return _mm512_mask3_fmaddsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmaddsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 - return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsubadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 - return _mm512_fmsubadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_fmsubadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 - return _mm512_mask_fmsubadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fmsubadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ps // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 - return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 
test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmaddsub_ps @@ -818,7 +820,7 @@ __m512 test_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m51 __m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_round_pd // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.512 - return _mm512_mask3_fmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_pd @@ -828,7 +830,7 @@ __m512d test_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask __m512 test_mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ps // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.512 - return _mm512_mask3_fmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_ps @@ -838,7 +840,7 @@ __m512 test_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 _ __m512d test_mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_pd // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.512 - return _mm512_mask3_fmsubadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmsubadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_pd @@ -848,7 +850,7 @@ __m512d test_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mm __m512 test_mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ps // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.512 - return _mm512_mask3_fmsubadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fmsubadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ps @@ -858,7 +860,7 @@ __m512 test_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask1 __m512d test_mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_round_pd // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.512 - return _mm512_mask_fnmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fnmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_pd @@ -868,7 +870,7 @@ __m512d test_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512 __m512 test_mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ps // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.512 - return _mm512_mask_fnmadd_round_ps(__A, __U, __B, 
__C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fnmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_ps @@ -878,12 +880,12 @@ __m512 test_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 _ __m512d test_mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_round_pd // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.512 - return _mm512_mask_fnmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fnmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_pd // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.512 - return _mm512_mask3_fnmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fnmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_pd @@ -898,12 +900,12 @@ __m512d test_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmas __m512 test_mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ps // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.512 - return _mm512_mask_fnmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_fnmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ps // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.512 - return _mm512_mask3_fnmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT); + return _mm512_mask3_fnmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_ps @@ -1044,41 +1046,237 @@ __mmask8 test_mm512_mask_cmp_pd_mask(__mmask8 m, __m512d a, __m512d b) { return _mm512_mask_cmp_pd_mask(m, a, b, 0); } +__mmask8 test_mm512_cmpeq_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpeq_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpeq_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpeq_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpeq_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpeq_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpeq_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpeq_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmpeq_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpeq_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpeq_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpeq_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmple_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmple_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpeq_pd_mask(a, b); +} + +__mmask8 test_mm512_cmple_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmple_ps_mask + // CHECK: 
@llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmple_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmple_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmple_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmple_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmple_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmple_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmple_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmplt_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmplt_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmplt_pd_mask(a, b); +} + +__mmask8 test_mm512_cmplt_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmplt_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmplt_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmplt_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmplt_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmplt_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmplt_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmplt_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmplt_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmpneq_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpneq_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpneq_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpneq_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpneq_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpneq_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpneq_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpneq_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmpneq_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpneq_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpneq_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpneq_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmpnle_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpnle_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpnle_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpnle_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpnle_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpnle_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpnle_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpnle_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmpnle_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpnle_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpnle_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpnle_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmpnlt_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpnlt_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpnlt_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpnlt_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpnlt_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpnlt_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpnlt_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpnlt_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return 
_mm512_mask_cmpnlt_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpnlt_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpnlt_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpnlt_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmpord_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpord_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpord_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpord_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpord_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpord_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpord_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpord_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmpord_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpord_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpord_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpord_ps_mask(k, a, b); +} + +__mmask8 test_mm512_cmpunord_pd_mask(__m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_cmpunord_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_cmpunord_pd_mask(a, b); +} + +__mmask8 test_mm512_cmpunord_ps_mask(__m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_cmpunord_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_cmpunord_ps_mask(a, b); +} + +__mmask8 test_mm512_mask_cmpunord_pd_mask(__mmask8 k, __m512d a, __m512d b) { + // CHECK-LABEL: @test_mm512_mask_cmpunord_pd_mask + // CHECK: @llvm.x86.avx512.mask.cmp.pd.512 + return _mm512_mask_cmpunord_pd_mask(k, a, b); +} + +__mmask8 test_mm512_mask_cmpunord_ps_mask(__mmask8 k, __m512 a, __m512 b) { + // CHECK-LABEL: @test_mm512_mask_cmpunord_ps_mask + // CHECK: @llvm.x86.avx512.mask.cmp.ps.512 + return _mm512_mask_cmpunord_ps_mask(k, a, b); +} + __m256d test_mm512_extractf64x4_pd(__m512d a) { // CHECK-LABEL: @test_mm512_extractf64x4_pd - // CHECK: @llvm.x86.avx512.mask.vextractf64x4.512 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> return _mm512_extractf64x4_pd(a, 1); } __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ - //CHECK-LABEL:@test_mm512_mask_extractf64x4_pd - //CHECL:@llvm.x86.avx512.mask.vextractf64x4.512 - return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1); + // CHECK-LABEL:@test_mm512_mask_extractf64x4_pd + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} + return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1); } __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ - //CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd - //CHECL:@llvm.x86.avx512.mask.vextractf64x4.512 - return _mm512_maskz_extractf64x4_pd( __U, __A, 1); + // CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} + return _mm512_maskz_extractf64x4_pd( __U, __A, 1); } __m128 test_mm512_extractf32x4_ps(__m512 a) { // CHECK-LABEL: @test_mm512_extractf32x4_ps - // CHECK: @llvm.x86.avx512.mask.vextractf32x4.512 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> return 
_mm512_extractf32x4_ps(a, 1); } __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512d __A){ - //CHECK-LABEL:@test_mm512_mask_extractf32x4_ps - //CHECL: @llvm.x86.avx512.mask.vextractf32x4.512 - return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1); + // CHECK-LABEL:@test_mm512_mask_extractf32x4_ps + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1); } __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512d __A){ - //CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps - //CHECL: @llvm.x86.avx512.mask.vextractf32x4.512 + // CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm512_maskz_extractf32x4_ps( __U, __A, 1); } @@ -1342,30 +1540,30 @@ __mmask8 test_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __ return (__mmask8)_mm512_mask_cmpneq_epu64_mask(__u, __a, __b); } -__mmask16 test_mm512_cmp_epi32_mask(__m512i __a, __m512i __b) { - // CHECK-LABEL: @test_mm512_cmp_epi32_mask +__mmask16 test_mm512_cmp_eq_epi32_mask(__m512i __a, __m512i __b) { + // CHECK-LABEL: @test_mm512_cmp_eq_epi32_mask // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}} - return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, 0); + return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ); } -__mmask16 test_mm512_mask_cmp_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - // CHECK-LABEL: @test_mm512_mask_cmp_epi32_mask +__mmask16 test_mm512_mask_cmp_eq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi32_mask // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} - return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, 0); + return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm512_cmp_epi64_mask(__m512i __a, __m512i __b) { - // CHECK-LABEL: @test_mm512_cmp_epi64_mask +__mmask8 test_mm512_cmp_eq_epi64_mask(__m512i __a, __m512i __b) { + // CHECK-LABEL: @test_mm512_cmp_eq_epi64_mask // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}} - return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, 0); + return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm512_mask_cmp_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - // CHECK-LABEL: @test_mm512_mask_cmp_epi64_mask +__mmask8 test_mm512_mask_cmp_eq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi64_mask // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} - return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, 0); + return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ); } __mmask16 test_mm512_cmp_epu32_mask(__m512i __a, __m512i __b) { @@ -1587,14 +1785,16 @@ __m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) { __m512i test_mm512_maskz_sub_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.512 + //CHECK: sub <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_sub_epi32(__k,__A,__B); } __m512i test_mm512_mask_sub_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i 
__src) { //CHECK-LABEL: @test_mm512_mask_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.512 + //CHECK: sub <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_sub_epi32(__src,__k,__A,__B); } @@ -1606,14 +1806,16 @@ __m512i test_mm512_sub_epi32(__m512i __A, __m512i __B) { __m512i test_mm512_maskz_sub_epi64 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.512 + //CHECK: sub <8 x i64> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_sub_epi64(__k,__A,__B); } __m512i test_mm512_mask_sub_epi64 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.512 + //CHECK: sub <8 x i64> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_sub_epi64(__src,__k,__A,__B); } @@ -1625,14 +1827,16 @@ __m512i test_mm512_sub_epi64(__m512i __A, __m512i __B) { __m512i test_mm512_maskz_add_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.512 + //CHECK: add <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_add_epi32(__k,__A,__B); } __m512i test_mm512_mask_add_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.512 + //CHECK: add <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_add_epi32(__src,__k,__A,__B); } @@ -1644,14 +1848,16 @@ __m512i test_mm512_add_epi32(__m512i __A, __m512i __B) { __m512i test_mm512_maskz_add_epi64 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.512 + //CHECK: add <8 x i64> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_add_epi64(__k,__A,__B); } __m512i test_mm512_mask_add_epi64 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.512 + //CHECK: add <8 x i64> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_add_epi64(__src,__k,__A,__B); } @@ -1661,41 +1867,59 @@ __m512i test_mm512_add_epi64(__m512i __A, __m512i __B) { return _mm512_add_epi64(__A,__B); } +__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) { + //CHECK-LABEL: @test_mm512_mul_epi32 + //CHECK: @llvm.x86.avx512.pmul.dq.512 + return _mm512_mul_epi32(__A,__B); +} + __m512i test_mm512_maskz_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.512 + //CHECK: @llvm.x86.avx512.pmul.dq.512 + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epi32(__k,__A,__B); } __m512i test_mm512_mask_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.512 + //CHECK: @llvm.x86.avx512.pmul.dq.512 + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epi32(__src,__k,__A,__B); } +__m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) { + 
//CHECK-LABEL: @test_mm512_mul_epu32 + //CHECK: @llvm.x86.avx512.pmulu.dq.512 + return _mm512_mul_epu32(__A,__B); +} + __m512i test_mm512_maskz_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.512 + //CHECK: @llvm.x86.avx512.pmulu.dq.512 + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epu32(__k,__A,__B); } __m512i test_mm512_mask_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.512 + //CHECK: @llvm.x86.avx512.pmulu.dq.512 + //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epu32(__src,__k,__A,__B); } __m512i test_mm512_maskz_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_maskz_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.512 + //CHECK: mul <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_mullo_epi32(__k,__A,__B); } __m512i test_mm512_mask_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: @test_mm512_mask_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.512 + //CHECK: mul <16 x i32> %{{.*}}, %{{.*}} + //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_mullo_epi32(__src,__k,__A,__B); } @@ -1708,67 +1932,71 @@ __m512i test_mm512_mullo_epi32(__m512i __A, __m512i __B) { __m512d test_mm512_add_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_add_round_pd // CHECK: @llvm.x86.avx512.mask.add.pd.512 - return _mm512_add_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_add_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_add_round_pd // CHECK: @llvm.x86.avx512.mask.add.pd.512 - return _mm512_mask_add_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_add_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_add_round_pd // CHECK: @llvm.x86.avx512.mask.add.pd.512 - return _mm512_maskz_add_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_add_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.512 + // CHECK: fadd <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_add_pd(__W,__U,__A,__B); } __m512d test_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.512 + // CHECK: fadd <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_add_pd(__U,__A,__B); } __m512 test_mm512_add_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_add_round_ps // CHECK: @llvm.x86.avx512.mask.add.ps.512 - return _mm512_add_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_add_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 
test_mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_add_round_ps // CHECK: @llvm.x86.avx512.mask.add.ps.512 - return _mm512_mask_add_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_add_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_add_round_ps // CHECK: @llvm.x86.avx512.mask.add.ps.512 - return _mm512_maskz_add_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_add_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.512 + // CHECK: fadd <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_add_ps(__W,__U,__A,__B); } __m512 test_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.512 + // CHECK: fadd <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_add_ps(__U,__A,__B); } __m128 test_mm_add_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_add_round_ss // CHECK: @llvm.x86.avx512.mask.add.ss.round - return _mm_add_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_add_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_add_round_ss // CHECK: @llvm.x86.avx512.mask.add.ss.round - return _mm_mask_add_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_add_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_add_round_ss // CHECK: @llvm.x86.avx512.mask.add.ss.round - return _mm_maskz_add_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_add_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_add_ss @@ -1783,17 +2011,17 @@ __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_add_round_sd // CHECK: @llvm.x86.avx512.mask.add.sd.round - return _mm_add_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_add_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_add_round_sd // CHECK: @llvm.x86.avx512.mask.add.sd.round - return _mm_mask_add_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_add_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_add_round_sd // CHECK: @llvm.x86.avx512.mask.add.sd.round - return _mm_maskz_add_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_add_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d 
__A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_add_sd @@ -1808,67 +2036,71 @@ __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_sub_round_pd // CHECK: @llvm.x86.avx512.mask.sub.pd.512 - return _mm512_sub_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_sub_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_sub_round_pd // CHECK: @llvm.x86.avx512.mask.sub.pd.512 - return _mm512_mask_sub_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_sub_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_sub_round_pd // CHECK: @llvm.x86.avx512.mask.sub.pd.512 - return _mm512_maskz_sub_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_sub_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.512 + // CHECK: fsub <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sub_pd(__W,__U,__A,__B); } __m512d test_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.512 + // CHECK: fsub <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_sub_pd(__U,__A,__B); } __m512 test_mm512_sub_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_sub_round_ps // CHECK: @llvm.x86.avx512.mask.sub.ps.512 - return _mm512_sub_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_sub_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_sub_round_ps // CHECK: @llvm.x86.avx512.mask.sub.ps.512 - return _mm512_mask_sub_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_sub_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_sub_round_ps // CHECK: @llvm.x86.avx512.mask.sub.ps.512 - return _mm512_maskz_sub_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_sub_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.512 + // CHECK: fsub <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sub_ps(__W,__U,__A,__B); } __m512 test_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.512 + // CHECK: fsub <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_sub_ps(__U,__A,__B); } __m128 test_mm_sub_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: 
@test_mm_sub_round_ss // CHECK: @llvm.x86.avx512.mask.sub.ss.round - return _mm_sub_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_sub_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_sub_round_ss // CHECK: @llvm.x86.avx512.mask.sub.ss.round - return _mm_mask_sub_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_sub_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_sub_round_ss // CHECK: @llvm.x86.avx512.mask.sub.ss.round - return _mm_maskz_sub_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_sub_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_sub_ss @@ -1883,17 +2115,17 @@ __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_sub_round_sd // CHECK: @llvm.x86.avx512.mask.sub.sd.round - return _mm_sub_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_sub_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_sub_round_sd // CHECK: @llvm.x86.avx512.mask.sub.sd.round - return _mm_mask_sub_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_sub_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_sub_round_sd // CHECK: @llvm.x86.avx512.mask.sub.sd.round - return _mm_maskz_sub_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_sub_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_sub_sd @@ -1908,67 +2140,71 @@ __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mul_round_pd // CHECK: @llvm.x86.avx512.mask.mul.pd.512 - return _mm512_mul_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mul_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_mul_round_pd // CHECK: @llvm.x86.avx512.mask.mul.pd.512 - return _mm512_mask_mul_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_mul_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_mul_round_pd // CHECK: @llvm.x86.avx512.mask.mul.pd.512 - return _mm512_maskz_mul_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_mul_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd.512 + // CHECK: fmul <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 
x double> %{{.*}} return _mm512_mask_mul_pd(__W,__U,__A,__B); } __m512d test_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd.512 + // CHECK: fmul <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_mul_pd(__U,__A,__B); } __m512 test_mm512_mul_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mul_round_ps // CHECK: @llvm.x86.avx512.mask.mul.ps.512 - return _mm512_mul_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mul_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_mul_round_ps // CHECK: @llvm.x86.avx512.mask.mul.ps.512 - return _mm512_mask_mul_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_mul_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_mul_round_ps // CHECK: @llvm.x86.avx512.mask.mul.ps.512 - return _mm512_maskz_mul_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_mul_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps.512 + // CHECK: fmul <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_mul_ps(__W,__U,__A,__B); } __m512 test_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps.512 + // CHECK: fmul <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_mul_ps(__U,__A,__B); } __m128 test_mm_mul_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mul_round_ss // CHECK: @llvm.x86.avx512.mask.mul.ss.round - return _mm_mul_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mul_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_mul_round_ss // CHECK: @llvm.x86.avx512.mask.mul.ss.round - return _mm_mask_mul_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_mul_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_mul_round_ss // CHECK: @llvm.x86.avx512.mask.mul.ss.round - return _mm_maskz_mul_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_mul_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_mul_ss @@ -1983,17 +2219,17 @@ __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mul_round_sd // CHECK: @llvm.x86.avx512.mask.mul.sd.round - return _mm_mul_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mul_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_mul_round_sd(__m128d __W, 
__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_mul_round_sd // CHECK: @llvm.x86.avx512.mask.mul.sd.round - return _mm_mask_mul_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_mul_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_mul_round_sd // CHECK: @llvm.x86.avx512.mask.mul.sd.round - return _mm_maskz_mul_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_mul_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_mul_sd @@ -2008,17 +2244,17 @@ __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_div_round_pd // CHECK: @llvm.x86.avx512.mask.div.pd.512 - return _mm512_div_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_div_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_div_round_pd // CHECK: @llvm.x86.avx512.mask.div.pd.512 - return _mm512_mask_div_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_div_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_div_round_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_div_round_pd // CHECK: @llvm.x86.avx512.mask.div.pd.512 - return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_div_pd(__m512d __a, __m512d __b) { // CHECK-LABEL: @test_mm512_div_pd @@ -2027,28 +2263,30 @@ __m512d test_mm512_div_pd(__m512d __a, __m512d __b) { } __m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) { // CHECK-LABEL: @test_mm512_mask_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.512 + // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_div_pd(__w,__u,__a,__b); } __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.512 + // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_div_pd(__U,__A,__B); } __m512 test_mm512_div_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_div_round_ps // CHECK: @llvm.x86.avx512.mask.div.ps.512 - return _mm512_div_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_div_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_div_round_ps // CHECK: @llvm.x86.avx512.mask.div.ps.512 - return _mm512_mask_div_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_mask_div_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_div_round_ps // CHECK: @llvm.x86.avx512.mask.div.ps.512 - return 
_mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_div_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_div_ps @@ -2057,28 +2295,30 @@ __m512 test_mm512_div_ps(__m512 __A, __m512 __B) { } __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.512 + // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_div_ps(__W,__U,__A,__B); } __m512 test_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.512 + // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_div_ps(__U,__A,__B); } __m128 test_mm_div_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_div_round_ss // CHECK: @llvm.x86.avx512.mask.div.ss.round - return _mm_div_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_div_round_ss(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_div_round_ss // CHECK: @llvm.x86.avx512.mask.div.ss.round - return _mm_mask_div_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_div_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_div_round_ss // CHECK: @llvm.x86.avx512.mask.div.ss.round - return _mm_maskz_div_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_div_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_div_ss @@ -2093,17 +2333,17 @@ __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { __m128d test_mm_div_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_div_round_sd // CHECK: @llvm.x86.avx512.mask.div.sd.round - return _mm_div_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_div_round_sd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_div_round_sd // CHECK: @llvm.x86.avx512.mask.div.sd.round - return _mm_mask_div_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_mask_div_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_div_round_sd // CHECK: @llvm.x86.avx512.mask.div.sd.round - return _mm_maskz_div_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); + return _mm_maskz_div_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_div_sd @@ -2242,181 +2482,201 @@ __m512i test_mm512_undefined_epi32() { __m512i test_mm512_cvtepi8_epi32(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512 + // CHECK: sext <16 x i8> %{{.*}} to <16 x i32> return _mm512_cvtepi8_epi32(__A); } __m512i 
test_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512 + // CHECK: sext <16 x i8> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_cvtepi8_epi32(__W, __U, __A); } __m512i test_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512 + // CHECK: sext <16 x i8> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_cvtepi8_epi32(__U, __A); } __m512i test_mm512_cvtepi8_epi64(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i64> return _mm512_cvtepi8_epi64(__A); } __m512i test_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepi8_epi64(__W, __U, __A); } __m512i test_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepi8_epi64(__U, __A); } __m512i test_mm512_cvtepi32_epi64(__m256i __X) { // CHECK-LABEL: @test_mm512_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512 + // CHECK: sext <8 x i32> %{{.*}} to <8 x i64> return _mm512_cvtepi32_epi64(__X); } __m512i test_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512 + // CHECK: sext <8 x i32> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepi32_epi64(__W, __U, __X); } __m512i test_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512 + // CHECK: sext <8 x i32> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepi32_epi64(__U, __X); } __m512i test_mm512_cvtepi16_epi32(__m256i __A) { // CHECK-LABEL: @test_mm512_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512 + // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> return _mm512_cvtepi16_epi32(__A); } __m512i test_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512 + // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_cvtepi16_epi32(__W, __U, __A); } __m512i test_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512 + // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_cvtepi16_epi32(__U, __A); } __m512i test_mm512_cvtepi16_epi64(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepi16_epi64 - // CHECK: 
@llvm.x86.avx512.mask.pmovsxw.q.512 + // CHECK: sext <8 x i16> %{{.*}} to <8 x i64> return _mm512_cvtepi16_epi64(__A); } __m512i test_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512 + // CHECK: sext <8 x i16> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepi16_epi64(__W, __U, __A); } __m512i test_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512 + // CHECK: sext <8 x i16> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepi16_epi64(__U, __A); } __m512i test_mm512_cvtepu8_epi32(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512 + // CHECK: zext <16 x i8> %{{.*}} to <16 x i32> return _mm512_cvtepu8_epi32(__A); } __m512i test_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512 + // CHECK: zext <16 x i8> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_cvtepu8_epi32(__W, __U, __A); } __m512i test_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512 + // CHECK: zext <16 x i8> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_cvtepu8_epi32(__U, __A); } __m512i test_mm512_cvtepu8_epi64(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512 + // CHECK: zext <8 x i8> %{{.*}} to <8 x i64> return _mm512_cvtepu8_epi64(__A); } __m512i test_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512 + // CHECK: zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepu8_epi64(__W, __U, __A); } __m512i test_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512 + // CHECK: zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepu8_epi64(__U, __A); } __m512i test_mm512_cvtepu32_epi64(__m256i __X) { // CHECK-LABEL: @test_mm512_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512 + // CHECK: zext <8 x i32> %{{.*}} to <8 x i64> return _mm512_cvtepu32_epi64(__X); } __m512i test_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { // CHECK-LABEL: @test_mm512_mask_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512 + // CHECK: zext <8 x i32> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepu32_epi64(__W, __U, __X); } __m512i test_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { // CHECK-LABEL: @test_mm512_maskz_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512 + // CHECK: zext <8 x i32> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepu32_epi64(__U, __X); } __m512i 
test_mm512_cvtepu16_epi32(__m256i __A) { // CHECK-LABEL: @test_mm512_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512 + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> return _mm512_cvtepu16_epi32(__A); } __m512i test_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512 + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_cvtepu16_epi32(__W, __U, __A); } __m512i test_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512 + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_cvtepu16_epi32(__U, __A); } __m512i test_mm512_cvtepu16_epi64(__m128i __A) { // CHECK-LABEL: @test_mm512_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512 + // CHECK: zext <8 x i16> %{{.*}} to <8 x i64> return _mm512_cvtepu16_epi64(__A); } __m512i test_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512 + // CHECK: zext <8 x i16> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_cvtepu16_epi64(__W, __U, __A); } __m512i test_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512 + // CHECK: zext <8 x i16> %{{.*}} to <8 x i64> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_cvtepu16_epi64(__U, __A); } @@ -2568,73 +2828,80 @@ __m512i test_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) { __m512i test_mm512_slli_epi32(__m512i __A) { // CHECK-LABEL: @test_mm512_slli_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.di.512 + // CHECK: @llvm.x86.avx512.pslli.d.512 return _mm512_slli_epi32(__A, 5); } __m512i test_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_slli_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.di.512 + // CHECK: @llvm.x86.avx512.pslli.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_slli_epi32(__W, __U, __A, 5); } __m512i test_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_slli_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.di.512 + // CHECK: @llvm.x86.avx512.pslli.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_slli_epi32(__U, __A, 5); } __m512i test_mm512_slli_epi64(__m512i __A) { // CHECK-LABEL: @test_mm512_slli_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.qi.512 + // CHECK: @llvm.x86.avx512.pslli.q.512 return _mm512_slli_epi64(__A, 5); } __m512i test_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_slli_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.qi.512 + // CHECK: @llvm.x86.avx512.pslli.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_slli_epi64(__W, __U, __A, 5); } __m512i test_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_slli_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.qi.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x 
i64> %{{.*}} return _mm512_maskz_slli_epi64(__U, __A, 5); } __m512i test_mm512_srli_epi32(__m512i __A) { // CHECK-LABEL: @test_mm512_srli_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.di.512 + // CHECK: @llvm.x86.avx512.psrli.d.512 return _mm512_srli_epi32(__A, 5); } __m512i test_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srli_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.di.512 + // CHECK: @llvm.x86.avx512.psrli.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_srli_epi32(__W, __U, __A, 5); } __m512i test_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_srli_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.di.512 + // CHECK: @llvm.x86.avx512.psrli.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_srli_epi32(__U, __A, 5); } __m512i test_mm512_srli_epi64(__m512i __A) { // CHECK-LABEL: @test_mm512_srli_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.qi.512 + // CHECK: @llvm.x86.avx512.psrli.q.512 return _mm512_srli_epi64(__A, 5); } __m512i test_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srli_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.qi.512 + // CHECK: @llvm.x86.avx512.psrli.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_srli_epi64(__W, __U, __A, 5); } __m512i test_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_srli_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.qi.512 + // CHECK: @llvm.x86.avx512.psrli.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_srli_epi64(__U, __A, 5); } @@ -2721,13 +2988,13 @@ __m512d test_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) { int test_mm_comi_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_comi_round_sd // CHECK: @llvm.x86.avx512.vcomi.sd - return _mm_comi_round_sd(__A, __B, 5, 3); + return _mm_comi_round_sd(__A, __B, 5, _MM_FROUND_NO_EXC); } int test_mm_comi_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_comi_round_ss // CHECK: @llvm.x86.avx512.vcomi.ss - return _mm_comi_round_ss(__A, __B, 5, 3); + return _mm_comi_round_ss(__A, __B, 5, _MM_FROUND_NO_EXC); } __m512d test_mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C) { @@ -2934,11 +3201,13 @@ __m512d test_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m5 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_unpackhi_pd(__W, __U, __A, __B); } +#if __x86_64__ unsigned long long test_mm_cvt_roundsd_si64(__m128d __A) { // CHECK-LABEL: @test_mm_cvt_roundsd_si64 // CHECK: @llvm.x86.avx512.vcvtsd2si64 return _mm_cvt_roundsd_si64(__A, _MM_FROUND_CUR_DIRECTION); } +#endif __m512i test_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, __m512i __B) { // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.512 @@ -2956,11 +3225,13 @@ __m512d test_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_unpackhi_pd(__U, __A, __B); } +#if __x86_64__ long long test_mm_cvt_roundsd_i64(__m128d __A) { // CHECK-LABEL: @test_mm_cvt_roundsd_i64 // CHECK: @llvm.x86.avx512.vcvtsd2si64 return _mm_cvt_roundsd_i64(__A, _MM_FROUND_CUR_DIRECTION); } +#endif __m512d 
test_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) { // CHECK-LABEL: @test_mm512_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.512 @@ -3038,6 +3309,7 @@ unsigned test_mm_cvtsd_u32(__m128d __A) { return _mm_cvtsd_u32(__A); } +#ifdef __x86_64__ unsigned long long test_mm_cvt_roundsd_u64(__m128d __A) { // CHECK-LABEL: @test_mm_cvt_roundsd_u64 // CHECK: @llvm.x86.avx512.vcvtsd2usi64 @@ -3049,6 +3321,7 @@ unsigned long long test_mm_cvtsd_u64(__m128d __A) { // CHECK: @llvm.x86.avx512.vcvtsd2usi64 return _mm_cvtsd_u64(__A); } +#endif int test_mm_cvt_roundss_si32(__m128 __A) { // CHECK-LABEL: @test_mm_cvt_roundss_si32 @@ -3062,6 +3335,7 @@ int test_mm_cvt_roundss_i32(__m128 __A) { return _mm_cvt_roundss_i32(__A, _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ int test_mm_cvt_roundss_si64(__m128 __A) { // CHECK-LABEL: @test_mm_cvt_roundss_si64 // CHECK: @llvm.x86.avx512.vcvtss2si64 @@ -3073,6 +3347,7 @@ long long test_mm_cvt_roundss_i64(__m128 __A) { // CHECK: @llvm.x86.avx512.vcvtss2si64 return _mm_cvt_roundss_i64(__A, _MM_FROUND_CUR_DIRECTION); } +#endif unsigned test_mm_cvt_roundss_u32(__m128 __A) { // CHECK-LABEL: @test_mm_cvt_roundss_u32 @@ -3086,6 +3361,7 @@ unsigned test_mm_cvtss_u32(__m128 __A) { return _mm_cvtss_u32(__A); } +#ifdef __x86_64__ unsigned long long test_mm_cvt_roundss_u64(__m128 __A) { // CHECK-LABEL: @test_mm_cvt_roundss_u64 // CHECK: @llvm.x86.avx512.vcvtss2usi64 @@ -3097,6 +3373,7 @@ unsigned long long test_mm_cvtss_u64(__m128 __A) { // CHECK: @llvm.x86.avx512.vcvtss2usi64 return _mm_cvtss_u64(__A); } +#endif int test_mm_cvtt_roundsd_i32(__m128d __A) { // CHECK-LABEL: @test_mm_cvtt_roundsd_i32 @@ -3116,6 +3393,7 @@ int test_mm_cvttsd_i32(__m128d __A) { return _mm_cvttsd_i32(__A); } +#ifdef __x86_64__ unsigned long long test_mm_cvtt_roundsd_si64(__m128d __A) { // CHECK-LABEL: @test_mm_cvtt_roundsd_si64 // CHECK: @llvm.x86.avx512.cvttsd2si64 @@ -3133,6 +3411,7 @@ long long test_mm_cvttsd_i64(__m128d __A) { // CHECK: @llvm.x86.avx512.cvttsd2si64 return _mm_cvttsd_i64(__A); } +#endif unsigned test_mm_cvtt_roundsd_u32(__m128d __A) { // CHECK-LABEL: @test_mm_cvtt_roundsd_u32 @@ -3146,6 +3425,7 @@ unsigned test_mm_cvttsd_u32(__m128d __A) { return _mm_cvttsd_u32(__A); } +#ifdef __x86_64__ unsigned long long test_mm_cvtt_roundsd_u64(__m128d __A) { // CHECK-LABEL: @test_mm_cvtt_roundsd_u64 // CHECK: @llvm.x86.avx512.cvttsd2usi64 @@ -3157,6 +3437,7 @@ unsigned long long test_mm_cvttsd_u64(__m128d __A) { // CHECK: @llvm.x86.avx512.cvttsd2usi64 return _mm_cvttsd_u64(__A); } +#endif int test_mm_cvtt_roundss_i32(__m128 __A) { // CHECK-LABEL: @test_mm_cvtt_roundss_i32 @@ -3176,6 +3457,7 @@ int test_mm_cvttss_i32(__m128 __A) { return _mm_cvttss_i32(__A); } +#ifdef __x86_64__ float test_mm_cvtt_roundss_i64(__m128 __A) { // CHECK-LABEL: @test_mm_cvtt_roundss_i64 // CHECK: @llvm.x86.avx512.cvttss2si64 @@ -3193,6 +3475,7 @@ long long test_mm_cvttss_i64(__m128 __A) { // CHECK: @llvm.x86.avx512.cvttss2si64 return _mm_cvttss_i64(__A); } +#endif unsigned test_mm_cvtt_roundss_u32(__m128 __A) { // CHECK-LABEL: @test_mm_cvtt_roundss_u32 @@ -3206,6 +3489,7 @@ unsigned test_mm_cvttss_u32(__m128 __A) { return _mm_cvttss_u32(__A); } +#ifdef __x86_64__ unsigned long long test_mm_cvtt_roundss_u64(__m128 __A) { // CHECK-LABEL: @test_mm_cvtt_roundss_u64 // CHECK: @llvm.x86.avx512.cvttss2usi64 @@ -3217,6 +3501,7 @@ unsigned long long test_mm_cvttss_u64(__m128 __A) { // CHECK: @llvm.x86.avx512.cvttss2usi64 return _mm_cvttss_u64(__A); } +#endif __m512i 
test_mm512_cvtt_roundps_epu32(__m512 __A) { @@ -3469,37 +3754,41 @@ __m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) { __m512d test_mm512_permutevar_pd(__m512d __A, __m512i __C) { // CHECK-LABEL: @test_mm512_permutevar_pd - // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512 + // CHECK: @llvm.x86.avx512.vpermilvar.pd.512 return _mm512_permutevar_pd(__A, __C); } __m512d test_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { // CHECK-LABEL: @test_mm512_mask_permutevar_pd - // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512 + // CHECK: @llvm.x86.avx512.vpermilvar.pd.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_permutevar_pd(__W, __U, __A, __C); } __m512d test_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_permutevar_pd - // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512 + // CHECK: @llvm.x86.avx512.vpermilvar.pd.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_permutevar_pd(__U, __A, __C); } __m512 test_mm512_permutevar_ps(__m512 __A, __m512i __C) { // CHECK-LABEL: @test_mm512_permutevar_ps - // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512 + // CHECK: @llvm.x86.avx512.vpermilvar.ps.512 return _mm512_permutevar_ps(__A, __C); } __m512 test_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { // CHECK-LABEL: @test_mm512_mask_permutevar_ps - // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512 + // CHECK: @llvm.x86.avx512.vpermilvar.ps.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_permutevar_ps(__W, __U, __A, __C); } __m512 test_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_permutevar_ps - // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512 + // CHECK: @llvm.x86.avx512.vpermilvar.ps.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_permutevar_ps(__U, __A, __C); } @@ -3880,253 +4169,281 @@ __m128 test_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B){ __m512i test_mm512_srai_epi32(__m512i __A) { // CHECK-LABEL: @test_mm512_srai_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.di.512 + // CHECK: @llvm.x86.avx512.psrai.d.512 return _mm512_srai_epi32(__A, 5); } __m512i test_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srai_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.di.512 + // CHECK: @llvm.x86.avx512.psrai.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_srai_epi32(__W, __U, __A, 5); } __m512i test_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_srai_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.di.512 + // CHECK: @llvm.x86.avx512.psrai.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_srai_epi32(__U, __A, 5); } __m512i test_mm512_srai_epi64(__m512i __A) { // CHECK-LABEL: @test_mm512_srai_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.qi.512 + // CHECK: @llvm.x86.avx512.psrai.q.512 return _mm512_srai_epi64(__A, 5); } __m512i test_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_srai_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.qi.512 + // CHECK: @llvm.x86.avx512.psrai.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return 
_mm512_mask_srai_epi64(__W, __U, __A, 5); } __m512i test_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_srai_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.qi.512 + // CHECK: @llvm.x86.avx512.psrai.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_srai_epi64(__U, __A, 5); } __m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sll_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.d + // CHECK: @llvm.x86.avx512.psll.d.512 return _mm512_sll_epi32(__A, __B); } __m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sll_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.d + // CHECK: @llvm.x86.avx512.psll.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_sll_epi32(__W, __U, __A, __B); } __m512i test_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sll_epi32 - // CHECK: @llvm.x86.avx512.mask.psll.d + // CHECK: @llvm.x86.avx512.psll.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_sll_epi32(__U, __A, __B); } __m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sll_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.q + // CHECK: @llvm.x86.avx512.psll.q.512 return _mm512_sll_epi64(__A, __B); } __m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sll_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.q + // CHECK: @llvm.x86.avx512.psll.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_sll_epi64(__W, __U, __A, __B); } __m512i test_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sll_epi64 - // CHECK: @llvm.x86.avx512.mask.psll.q + // CHECK: @llvm.x86.avx512.psll.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_sll_epi64(__U, __A, __B); } __m512i test_mm512_sllv_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_sllv_epi32 - // CHECK: @llvm.x86.avx512.mask.psllv.d + // CHECK: @llvm.x86.avx512.psllv.d.512 return _mm512_sllv_epi32(__X, __Y); } __m512i test_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_sllv_epi32 - // CHECK: @llvm.x86.avx512.mask.psllv.d + // CHECK: @llvm.x86.avx512.psllv.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_sllv_epi32(__W, __U, __X, __Y); } __m512i test_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_sllv_epi32 - // CHECK: @llvm.x86.avx512.mask.psllv.d + // CHECK: @llvm.x86.avx512.psllv.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_sllv_epi32(__U, __X, __Y); } __m512i test_mm512_sllv_epi64(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_sllv_epi64 - // CHECK: @llvm.x86.avx512.mask.psllv.q + // CHECK: @llvm.x86.avx512.psllv.q.512 return _mm512_sllv_epi64(__X, __Y); } __m512i test_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_sllv_epi64 - // CHECK: @llvm.x86.avx512.mask.psllv.q + // CHECK: @llvm.x86.avx512.psllv.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return 
_mm512_mask_sllv_epi64(__W, __U, __X, __Y); } __m512i test_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_sllv_epi64 - // CHECK: @llvm.x86.avx512.mask.psllv.q + // CHECK: @llvm.x86.avx512.psllv.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_sllv_epi64(__U, __X, __Y); } __m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sra_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.d + // CHECK: @llvm.x86.avx512.psra.d.512 return _mm512_sra_epi32(__A, __B); } __m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sra_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.d + // CHECK: @llvm.x86.avx512.psra.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_sra_epi32(__W, __U, __A, __B); } __m512i test_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sra_epi32 - // CHECK: @llvm.x86.avx512.mask.psra.d + // CHECK: @llvm.x86.avx512.psra.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_sra_epi32(__U, __A, __B); } __m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_sra_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.q + // CHECK: @llvm.x86.avx512.psra.q.512 return _mm512_sra_epi64(__A, __B); } __m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_sra_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.q + // CHECK: @llvm.x86.avx512.psra.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_sra_epi64(__W, __U, __A, __B); } __m512i test_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_sra_epi64 - // CHECK: @llvm.x86.avx512.mask.psra.q + // CHECK: @llvm.x86.avx512.psra.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_sra_epi64(__U, __A, __B); } __m512i test_mm512_srav_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_srav_epi32 - // CHECK: @llvm.x86.avx512.mask.psrav.d + // CHECK: @llvm.x86.avx512.psrav.d.512 return _mm512_srav_epi32(__X, __Y); } __m512i test_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_srav_epi32 - // CHECK: @llvm.x86.avx512.mask.psrav.d + // CHECK: @llvm.x86.avx512.psrav.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_srav_epi32(__W, __U, __X, __Y); } __m512i test_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_srav_epi32 - // CHECK: @llvm.x86.avx512.mask.psrav.d + // CHECK: @llvm.x86.avx512.psrav.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_srav_epi32(__U, __X, __Y); } __m512i test_mm512_srav_epi64(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_srav_epi64 - // CHECK: @llvm.x86.avx512.mask.psrav.q + // CHECK: @llvm.x86.avx512.psrav.q.512 return _mm512_srav_epi64(__X, __Y); } __m512i test_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_srav_epi64 - // CHECK: @llvm.x86.avx512.mask.psrav.q + // CHECK: @llvm.x86.avx512.psrav.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return 
_mm512_mask_srav_epi64(__W, __U, __X, __Y); } __m512i test_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_srav_epi64 - // CHECK: @llvm.x86.avx512.mask.psrav.q + // CHECK: @llvm.x86.avx512.psrav.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_srav_epi64(__U, __X, __Y); } __m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_srl_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.d + // CHECK: @llvm.x86.avx512.psrl.d.512 return _mm512_srl_epi32(__A, __B); } __m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_srl_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.d + // CHECK: @llvm.x86.avx512.psrl.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_srl_epi32(__W, __U, __A, __B); } __m512i test_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_srl_epi32 - // CHECK: @llvm.x86.avx512.mask.psrl.d + // CHECK: @llvm.x86.avx512.psrl.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_srl_epi32(__U, __A, __B); } __m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_srl_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.q + // CHECK: @llvm.x86.avx512.psrl.q.512 return _mm512_srl_epi64(__A, __B); } __m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_mask_srl_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.q + // CHECK: @llvm.x86.avx512.psrl.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_srl_epi64(__W, __U, __A, __B); } __m512i test_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_srl_epi64 - // CHECK: @llvm.x86.avx512.mask.psrl.q + // CHECK: @llvm.x86.avx512.psrl.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_srl_epi64(__U, __A, __B); } __m512i test_mm512_srlv_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_srlv_epi32 - // CHECK: @llvm.x86.avx512.mask.psrlv.d + // CHECK: @llvm.x86.avx512.psrlv.d.512 return _mm512_srlv_epi32(__X, __Y); } __m512i test_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_srlv_epi32 - // CHECK: @llvm.x86.avx512.mask.psrlv.d + // CHECK: @llvm.x86.avx512.psrlv.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_srlv_epi32(__W, __U, __X, __Y); } __m512i test_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_srlv_epi32 - // CHECK: @llvm.x86.avx512.mask.psrlv.d + // CHECK: @llvm.x86.avx512.psrlv.d.512 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_srlv_epi32(__U, __X, __Y); } __m512i test_mm512_srlv_epi64(__m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_srlv_epi64 - // CHECK: @llvm.x86.avx512.mask.psrlv.q + // CHECK: @llvm.x86.avx512.psrlv.q.512 return _mm512_srlv_epi64(__X, __Y); } __m512i test_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_mask_srlv_epi64 - // CHECK: @llvm.x86.avx512.mask.psrlv.q + // CHECK: @llvm.x86.avx512.psrlv.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return 
_mm512_mask_srlv_epi64(__W, __U, __X, __Y); } __m512i test_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { // CHECK-LABEL: @test_mm512_maskz_srlv_epi64 - // CHECK: @llvm.x86.avx512.mask.psrlv.q + // CHECK: @llvm.x86.avx512.psrlv.q.512 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_srlv_epi64(__U, __X, __Y); } @@ -4844,109 +5161,121 @@ void test_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) __m128i test_mm512_extracti32x4_epi32(__m512i __A) { // CHECK-LABEL: @test_mm512_extracti32x4_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15> return _mm512_extracti32x4_epi32(__A, 3); } __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); } __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32 - // CHECK: @llvm.x86.avx512.mask.vextracti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); } __m256i test_mm512_extracti64x4_epi64(__m512i __A) { // CHECK-LABEL: @test_mm512_extracti64x4_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> return _mm512_extracti64x4_epi64(__A, 1); } __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); } __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64 - // CHECK: @llvm.x86.avx512.mask.vextracti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); } __m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) { // CHECK-LABEL: @test_mm512_insertf64x4 - // CHECK: @llvm.x86.avx512.mask.insertf64x4 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> return _mm512_insertf64x4(__A, __B, 1); } __m512d test_mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B) { // CHECK-LABEL: @test_mm512_mask_insertf64x4 - // CHECK: @llvm.x86.avx512.mask.insertf64x4 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_insertf64x4(__W, __U, __A, __B, 1); } __m512d 
test_mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B) { // CHECK-LABEL: @test_mm512_maskz_insertf64x4 - // CHECK: @llvm.x86.avx512.mask.insertf64x4 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_insertf64x4(__U, __A, __B, 1); } __m512i test_mm512_inserti64x4(__m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_inserti64x4 - // CHECK: @llvm.x86.avx512.mask.inserti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> return _mm512_inserti64x4(__A, __B, 1); } __m512i test_mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_mask_inserti64x4 - // CHECK: @llvm.x86.avx512.mask.inserti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_inserti64x4(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B) { // CHECK-LABEL: @test_mm512_maskz_inserti64x4 - // CHECK: @llvm.x86.avx512.mask.inserti64x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_inserti64x4(__U, __A, __B, 1); } __m512 test_mm512_insertf32x4(__m512 __A, __m128 __B) { // CHECK-LABEL: @test_mm512_insertf32x4 - // CHECK: @llvm.x86.avx512.mask.insertf32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm512_insertf32x4(__A, __B, 1); } __m512 test_mm512_mask_insertf32x4(__m512 __W, __mmask16 __U, __m512 __A, __m128 __B) { // CHECK-LABEL: @test_mm512_mask_insertf32x4 - // CHECK: @llvm.x86.avx512.mask.insertf32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_insertf32x4(__W, __U, __A, __B, 1); } __m512 test_mm512_maskz_insertf32x4(__mmask16 __U, __m512 __A, __m128 __B) { // CHECK-LABEL: @test_mm512_maskz_insertf32x4 - // CHECK: @llvm.x86.avx512.mask.insertf32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_insertf32x4(__U, __A, __B, 1); } __m512i test_mm512_inserti32x4(__m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_inserti32x4 - // CHECK: @llvm.x86.avx512.mask.inserti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm512_inserti32x4(__A, __B, 1); } __m512i test_mm512_mask_inserti32x4(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: 
@test_mm512_mask_inserti32x4 - // CHECK: @llvm.x86.avx512.mask.inserti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_inserti32x4(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti32x4(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: @test_mm512_maskz_inserti32x4 - // CHECK: @llvm.x86.avx512.mask.inserti32x4 + // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_inserti32x4(__U, __A, __B, 1); } @@ -5520,13 +5849,13 @@ __m128 test_mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss return _mm_mask3_fmsub_ss(__W, __X, __Y, __U); } __m128 test_mm_mask3_fmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_round_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } @@ -5592,13 +5921,13 @@ __m128 test_mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m12 __m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U); } __m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } @@ -5664,13 +5993,13 @@ __m128d test_mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m __m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd return _mm_mask3_fmsub_sd(__W, __X, __Y, __U); } __m128d test_mm_mask3_fmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_round_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } @@ -5736,13 +6065,13 @@ __m128d test_mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __ __m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U); } __m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd return 
_mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } @@ -6192,12 +6521,26 @@ __m512d test_mm512_cvtps_pd(__m256 __A) { return _mm512_cvtps_pd(__A); } +__m512d test_mm512_cvtpslo_pd(__m512 __A) { + // CHECK-LABEL: @test_mm512_cvtpslo_pd + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512 + return _mm512_cvtpslo_pd(__A); +} + __m512d test_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_mask_cvtps_pd // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512 return _mm512_mask_cvtps_pd(__W, __U, __A); } +__m512d test_mm512_mask_cvtpslo_pd(__m512d __W, __mmask8 __U, __m512 __A) { + // CHECK-LABEL: @test_mm512_mask_cvtpslo_pd + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512 + return _mm512_mask_cvtpslo_pd(__W, __U, __A); +} + __m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm512_maskz_cvtps_pd // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512 @@ -6491,6 +6834,7 @@ __m128 test_mm_maskz_cvt_roundsd_ss(__mmask8 __U, __m128 __A, __m128d __B) { return _mm_maskz_cvt_roundsd_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ __m128d test_mm_cvt_roundi64_sd(__m128d __A, long long __B) { // CHECK-LABEL: @test_mm_cvt_roundi64_sd // CHECK: @llvm.x86.avx512.cvtsi2sd64 @@ -6502,6 +6846,7 @@ __m128d test_mm_cvt_roundsi64_sd(__m128d __A, long long __B) { // CHECK: @llvm.x86.avx512.cvtsi2sd64 return _mm_cvt_roundsi64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif __m128 test_mm_cvt_roundsi32_ss(__m128 __A, int __B) { // CHECK-LABEL: @test_mm_cvt_roundsi32_ss @@ -6515,6 +6860,7 @@ __m128 test_mm_cvt_roundi32_ss(__m128 __A, int __B) { return _mm_cvt_roundi32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ __m128 test_mm_cvt_roundsi64_ss(__m128 __A, long long __B) { // CHECK-LABEL: @test_mm_cvt_roundsi64_ss // CHECK: @llvm.x86.avx512.cvtsi2ss64 @@ -6526,6 +6872,7 @@ __m128 test_mm_cvt_roundi64_ss(__m128 __A, long long __B) { // CHECK: @llvm.x86.avx512.cvtsi2ss64 return _mm_cvt_roundi64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif __m128d test_mm_cvt_roundss_sd(__m128d __A, __m128 __B) { // CHECK-LABEL: @test_mm_cvt_roundss_sd @@ -6551,6 +6898,7 @@ __m128d test_mm_cvtu32_sd(__m128d __A, unsigned __B) { return _mm_cvtu32_sd(__A, __B); } +#ifdef __x86_64__ __m128d test_mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B) { // CHECK-LABEL: @test_mm_cvt_roundu64_sd // CHECK: @llvm.x86.avx512.cvtusi642sd @@ -6562,6 +6910,7 @@ __m128d test_mm_cvtu64_sd(__m128d __A, unsigned long long __B) { // CHECK: @llvm.x86.avx512.cvtusi642sd return _mm_cvtu64_sd(__A, __B); } +#endif __m128 test_mm_cvt_roundu32_ss(__m128 __A, unsigned __B) { // CHECK-LABEL: @test_mm_cvt_roundu32_ss @@ -6575,6 +6924,7 @@ __m128 test_mm_cvtu32_ss(__m128 __A, unsigned __B) { return _mm_cvtu32_ss(__A, __B); } +#ifdef __x86_64__ __m128 test_mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B) { // CHECK-LABEL: @test_mm_cvt_roundu64_ss // CHECK: @llvm.x86.avx512.cvtusi642ss @@ -6586,6 +6936,7 @@ __m128 test_mm_cvtu64_ss(__m128 __A, unsigned long long __B) { // CHECK: @llvm.x86.avx512.cvtusi642ss return _mm_cvtu64_ss(__A, __B); } +#endif __m512i test_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) { @@ -6622,20 +6973,46 @@ __m512 test_mm512_maskz_cvtepu32_ps 
(__mmask16 __U, __m512i __A) return _mm512_maskz_cvtepu32_ps (__U,__A); } +__m512d test_mm512_cvtepi32_pd (__m256i __A) +{ + // CHECK-LABEL: @test_mm512_cvtepi32_pd + // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double> + return _mm512_cvtepi32_pd (__A); +} + __m512d test_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm512_mask_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512 + // CHECK-LABEL: @test_mm512_mask_cvtepi32_pd + // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} return _mm512_mask_cvtepi32_pd (__W,__U,__A); } __m512d test_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm512_maskz_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512 + // CHECK-LABEL: @test_mm512_maskz_cvtepi32_pd + // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} return _mm512_maskz_cvtepi32_pd (__U,__A); } +__m512d test_mm512_cvtepi32lo_pd (__m512i __A) +{ + // CHECK-LABEL: @test_mm512_cvtepi32lo_pd + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double> + return _mm512_cvtepi32lo_pd (__A); +} + +__m512d test_mm512_mask_cvtepi32lo_pd (__m512d __W, __mmask8 __U, __m512i __A) +{ + // CHECK-LABEL: @test_mm512_mask_cvtepi32lo_pd + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} + return _mm512_mask_cvtepi32lo_pd (__W, __U, __A); +} + __m512 test_mm512_cvtepi32_ps (__m512i __A) { // CHECK-LABEL: @test_mm512_cvtepi32_ps @@ -6657,20 +7034,46 @@ __m512 test_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) return _mm512_maskz_cvtepi32_ps (__U,__A); } +__m512d test_mm512_cvtepu32_pd(__m256i __A) +{ + // CHECK-LABEL: @test_mm512_cvtepu32_pd + // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double> + return _mm512_cvtepu32_pd(__A); +} + __m512d test_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm512_mask_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512 + // CHECK-LABEL: @test_mm512_mask_cvtepu32_pd + // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} return _mm512_mask_cvtepu32_pd (__W,__U,__A); } __m512d test_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm512_maskz_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512 + // CHECK-LABEL: @test_mm512_maskz_cvtepu32_pd + // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} return _mm512_maskz_cvtepu32_pd (__U,__A); } +__m512d test_mm512_cvtepu32lo_pd (__m512i __A) +{ + // CHECK-LABEL: @test_mm512_cvtepu32lo_pd + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double> + return _mm512_cvtepu32lo_pd (__A); +} + +__m512d test_mm512_mask_cvtepu32lo_pd (__m512d __W, __mmask8 __U, __m512i __A) +{ + // CHECK-LABEL: @test_mm512_mask_cvtepu32lo_pd + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double> + // CHECK: select <8 x i1> 
{{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}} + return _mm512_mask_cvtepu32lo_pd (__W, __U, __A); +} + __m256 test_mm512_cvtpd_ps (__m512d __A) { // CHECK-LABEL: @test_mm512_cvtpd_ps @@ -6685,6 +7088,23 @@ __m256 test_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) return _mm512_mask_cvtpd_ps (__W,__U,__A); } +__m512d test_mm512_cvtpd_pslo(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_cvtpd_pslo + // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512 + // CHECK: zeroinitializer + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + return _mm512_cvtpd_pslo(__A); +} + +__m512d test_mm512_mask_cvtpd_pslo(__m512 __W, __mmask8 __U, __m512d __A) { + // CHECK-LABEL: @test_mm512_mask_cvtpd_pslo + // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512 + // CHECK: zeroinitializer + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + return _mm512_mask_cvtpd_pslo(__W, __U, __A); +} + __m256 test_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_cvtpd_ps @@ -7069,115 +7489,211 @@ __m512d test_mm512_roundscale_round_pd(__m512d __A) return _mm512_roundscale_round_pd(__A,3,_MM_FROUND_CUR_DIRECTION); } +__m512i test_mm512_max_epi32 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_max_epi32 + // CHECK: [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + return _mm512_max_epi32 (__A,__B); +} + __m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512 + // CHECK: [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_mask_max_epi32 (__W,__M,__A,__B); } __m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512 + // CHECK: [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_maskz_max_epi32 (__M,__A,__B); } +__m512i test_mm512_max_epi64 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_max_epi64 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + return _mm512_max_epi64 (__A,__B); +} + __m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.512 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_mask_max_epi64 (__W,__M,__A,__B); } __m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epi64 - // CHECK: 
@llvm.x86.avx512.mask.pmaxs.q.512 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_maskz_max_epi64 (__M,__A,__B); } +__m512i test_mm512_max_epu64 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_max_epu64 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + return _mm512_max_epu64 (__A,__B); +} + __m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_mask_max_epu64 (__W,__M,__A,__B); } __m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_maskz_max_epu64 (__M,__A,__B); } +__m512i test_mm512_max_epu32 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_max_epu32 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + return _mm512_max_epu32 (__A,__B); +} + __m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_mask_max_epu32 (__W,__M,__A,__B); } __m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_maskz_max_epu32 (__M,__A,__B); } +__m512i test_mm512_min_epi32 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_min_epi32 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + return _mm512_min_epi32 (__A,__B); +} + __m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.512 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_mask_min_epi32 (__W,__M,__A,__B); } __m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i 
__B) { // CHECK-LABEL: @test_mm512_maskz_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.512 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_maskz_min_epi32 (__M,__A,__B); } +__m512i test_mm512_min_epu32 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_min_epu32 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + return _mm512_min_epu32 (__A,__B); +} + __m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.512 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_mask_min_epu32 (__W,__M,__A,__B); } __m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.512 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}} return _mm512_maskz_min_epu32 (__M,__A,__B); } +__m512i test_mm512_min_epi64 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_min_epi64 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + return _mm512_min_epi64 (__A,__B); +} + __m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.512 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_mask_min_epi64 (__W,__M,__A,__B); } __m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.512 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_maskz_min_epi64 (__M,__A,__B); } +__m512i test_mm512_min_epu64 (__m512i __A, __m512i __B) +{ + // CHECK-LABEL: @test_mm512_min_epu64 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + return _mm512_min_epu64 (__A,__B); +} + __m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.512 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_mask_min_epu64 (__W,__M,__A,__B); } __m512i 
test_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.512 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}} return _mm512_maskz_min_epu64 (__M,__A,__B); } @@ -7256,12 +7772,14 @@ __m512i test_mm512_setr_epi32 (int __A, int __B, int __C, int __D, __I, __J, __K, __L,__M, __N, __O, __P); } +#ifdef __x86_64__ __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { //CHECK-LABEL: @test_mm512_mask_set1_epi64 //CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.512 return _mm512_mask_set1_epi64 (__O, __M, __A); } +#endif __m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C, long long __D, long long __E, long long __F, @@ -7443,11 +7961,13 @@ int test_mm_cvtss_i32(__m128 A) { return _mm_cvtss_i32(A); } +#ifdef __x86_64__ long long test_mm_cvtss_i64(__m128 A) { // CHECK-LABEL: test_mm_cvtss_i64 // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}}) return _mm_cvtss_i64(A); } +#endif __m128d test_mm_cvti32_sd(__m128d A, int B) { // CHECK-LABEL: test_mm_cvti32_sd @@ -7456,12 +7976,14 @@ __m128d test_mm_cvti32_sd(__m128d A, int B) { return _mm_cvti32_sd(A, B); } +#ifdef __x86_64__ __m128d test_mm_cvti64_sd(__m128d A, long long B) { // CHECK-LABEL: test_mm_cvti64_sd // CHECK: sitofp i64 %{{.*}} to double // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_cvti64_sd(A, B); } +#endif __m128 test_mm_cvti32_ss(__m128 A, int B) { // CHECK-LABEL: test_mm_cvti32_ss @@ -7470,12 +7992,14 @@ __m128 test_mm_cvti32_ss(__m128 A, int B) { return _mm_cvti32_ss(A, B); } +#ifdef __x86_64__ __m128 test_mm_cvti64_ss(__m128 A, long long B) { // CHECK-LABEL: test_mm_cvti64_ss // CHECK: sitofp i64 %{{.*}} to float // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_cvti64_ss(A, B); } +#endif int test_mm_cvtsd_i32(__m128d A) { // CHECK-LABEL: test_mm_cvtsd_i32 @@ -7483,11 +8007,13 @@ int test_mm_cvtsd_i32(__m128d A) { return _mm_cvtsd_i32(A); } +#ifdef __x86_64__ long long test_mm_cvtsd_i64(__m128d A) { // CHECK-LABEL: test_mm_cvtsd_i64 // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}}) return _mm_cvtsd_i64(A); } +#endif __m128d test_mm_mask_cvtss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_cvtss_sd @@ -7549,6 +8075,143 @@ __m512d test_mm512_setzero_pd() return _mm512_setzero_pd(); } +__mmask16 test_mm512_int2mask(int __a) +{ + // O2-LABEL: test_mm512_int2mask + // O2: trunc i32 %__a to i16 + return _mm512_int2mask(__a); +} + +int test_mm512_mask2int(__mmask16 __a) +{ + // O2-LABEL: test_mm512_mask2int + // O2: zext i16 %__a to i32 + return _mm512_mask2int(__a); +} + +__m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + // O2-LABEL: @test_mm_mask_move_ss + // O2: %[[M:.*]] = and i8 %__U, 1 + // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0 + // O2: %[[ELM1:.*]] = extractelement <4 x float> %__B, i32 0 + // O2: %[[ELM2:.*]] = extractelement <4 x float> %__W, i32 0 + // O2: %[[SEL:.*]] = select i1 %[[M2]], float %[[ELM1]], float %[[ELM2]] + // O2: %[[RES:.*]] = insertelement <4 x float> %__A, float %[[SEL]], i32 0 + // O2: ret <4 x float> %[[RES]] + return _mm_mask_move_ss ( __W, __U, __A, __B); +} + +__m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + 
// O2-LABEL: @test_mm_maskz_move_ss + // O2: %[[M:.*]] = and i8 %__U, 1 + // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0 + // O2: %[[ELM1:.*]] = extractelement <4 x float> %__B, i32 0 + // O2: %[[SEL:.*]] = select i1 %[[M2]], float %[[ELM1]], float 0.0 + // O2: %[[RES:.*]] = insertelement <4 x float> %__A, float %[[SEL]], i32 0 + // O2: ret <4 x float> %[[RES]] + return _mm_maskz_move_ss (__U, __A, __B); +} + +__m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + // O2-LABEL: @test_mm_mask_move_sd + // O2: %[[M:.*]] = and i8 %__U, 1 + // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0 + // O2: %[[ELM1:.*]] = extractelement <2 x double> %__B, i32 0 + // O2: %[[ELM2:.*]] = extractelement <2 x double> %__W, i32 0 + // O2: %[[SEL:.*]] = select i1 %[[M2]], double %[[ELM1]], double %[[ELM2]] + // O2: %[[RES:.*]] = insertelement <2 x double> %__A, double %[[SEL]], i32 0 + // O2: ret <2 x double> %[[RES]] + return _mm_mask_move_sd ( __W, __U, __A, __B); +} + +__m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + // O2-LABEL: @test_mm_maskz_move_sd + // O2: %[[M:.*]] = and i8 %__U, 1 + // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0 + // O2: %[[ELM1:.*]] = extractelement <2 x double> %__B, i32 0 + // O2: %[[SEL:.*]] = select i1 %[[M2]], double %[[ELM1]], double 0.0 + // O2: %[[RES:.*]] = insertelement <2 x double> %__A, double %[[SEL]], i32 0 + // O2: ret <2 x double> %[[RES]] + return _mm_maskz_move_sd (__U, __A, __B); +} + +void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A) +{ + // O2-LABEL: @test_mm_mask_store_ss + // O2: %[[CAST:.*]] = bitcast float* %__P to <16 x float>* + // O2: %[[SHUFFLE:.*]] = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // O2: %[[MASK1:.*]] = and i8 %__U, 1 + // O2: %[[MASK2:.*]] = zext i8 %[[MASK1]] to i16 + // O2: %[[MASK3:.*]] = bitcast i16 %[[MASK2]] to <16 x i1> + // O2: tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %[[SHUFFLE]], <16 x float>* %[[CAST]], i32 16, <16 x i1> %[[MASK3]]) + _mm_mask_store_ss(__P, __U, __A); +} + +void test_mm_mask_store_sd(double * __P, __mmask8 __U, __m128d __A) +{ + // O2-LABEL: @test_mm_mask_store_sd + // O2: %[[CAST:.*]] = bitcast double* %__P to <8 x double>* + // O2: %[[SHUFFLE:.*]] = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // O2: %[[MASK1:.*]] = and i8 %__U, 1 + // O2: %[[MASK2:.*]] = bitcast i8 %[[MASK1]] to <8 x i1> + // O2: tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %[[SHUFFLE]], <8 x double>* %[[CAST]], i32 16, <8 x i1> %[[MASK2]]) + _mm_mask_store_sd(__P, __U, __A); +} + +__m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W) +{ + // O2-LABEL: @test_mm_mask_load_ss + // O2: %[[SHUF:.*]] = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4> + // O2: %[[PTR:.*]] = bitcast float* %__W to <16 x float>* + // O2: %[[SHUF2:.*]] = shufflevector <4 x float> %[[SHUF]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // O2: %[[AND:.*]] = and i8 %__U, 1 + // O2: %[[MASK:.*]] = zext i8 %[[AND]] to i16 + // O2: %[[MASK2:.*]] = 
bitcast i16 %[[MASK]] to <16 x i1> + // O2: %[[RES:.*]] = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %[[PTR]], i32 16, <16 x i1> %[[MASK2]], <16 x float> %[[SHUF2]]) + // O2: shufflevector <16 x float> %[[RES]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + return _mm_mask_load_ss(__A, __U, __W); +} + +__m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W) +{ + // O2-LABEL: @test_mm_maskz_load_ss + // O2: %[[PTR:.*]] = bitcast float* %__W to <16 x float>* + // O2: %[[AND:.*]] = and i8 %__U, 1 + // O2: %[[MASK:.*]] = zext i8 %[[AND]] to i16 + // O2: %[[MASK2:.*]] = bitcast i16 %[[MASK]] to <16 x i1> + // O2: %[[RES:.*]] = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %[[PTR]], i32 16, <16 x i1> %[[MASK2]], <16 x float> zeroinitializer) + // O2: shufflevector <16 x float> %[[RES]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + return _mm_maskz_load_ss (__U, __W); +} + +__m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W) +{ + // O2-LABEL: @test_mm_mask_load_sd + // O2: %[[SHUF:.*]] = insertelement <2 x double> %__A, double 0.000000e+00, i32 1 + // O2: %[[PTR:.*]] = bitcast double* %__W to <8 x double>* + // O2: %[[SHUF2:.*]] = shufflevector <2 x double> %[[SHUF]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + // O2: %[[AND:.*]] = and i8 %__U, 1 + // O2: %[[MASK:.*]] = bitcast i8 %[[AND]] to <8 x i1> + // O2: %[[RES:.*]] = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %[[PTR]], i32 16, <8 x i1> %[[MASK]], <8 x double> %[[SHUF2]]) + // O2: shufflevector <8 x double> %[[RES]], <8 x double> undef, <2 x i32> <i32 0, i32 1> + return _mm_mask_load_sd (__A, __U, __W); +} + +__m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W) +{ + // O2-LABEL: @test_mm_maskz_load_sd + // O2: %[[PTR:.*]] = bitcast double* %__W to <8 x double>* + // O2: %[[AND:.*]] = and i8 %__U, 1 + // O2: %[[MASK:.*]] = bitcast i8 %[[AND]] to <8 x i1> + // O2: %[[RES:.*]] = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %[[PTR]], i32 16, <8 x i1> %[[MASK]], <8 x double> zeroinitializer) + // O2: shufflevector <8 x double> %[[RES]], <8 x double> undef, <2 x i32> <i32 0, i32 1> + return _mm_maskz_load_sd (__U, __W); +} + __m512d test_mm512_abs_pd(__m512d a){ // CHECK-LABEL: @test_mm512_abs_pd // CHECK: and <8 x i64> diff --git a/test/CodeGen/avx512ifma-builtins.c b/test/CodeGen/avx512ifma-builtins.c index 366d3bec319b..311d6989bf0d 100644 --- a/test/CodeGen/avx512ifma-builtins.c +++ b/test/CodeGen/avx512ifma-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
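The RUN-line churn across these builtins tests follows one recipe: add -ffreestanding so that immintrin.h no longer drags in the hosted, system-specific mm_malloc.h (and through it stdlib.h), and tighten -Werror to -Wall -Werror. A minimal sketch of the test shape this converges on; the function name and its CHECK lines below are illustrative only, not part of the diff:

// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck %s

#include <immintrin.h>

__m512i example_setzero(void) {
  // With -ffreestanding in the RUN line, the '#define __MM_MALLOC_H' guard
  // hack these hunks delete is no longer needed to keep mm_malloc.h out.
  // CHECK-LABEL: @example_setzero
  // CHECK: zeroinitializer
  return _mm512_setzero_si512();
}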
-#define __MM_MALLOC_H

#include <immintrin.h>

diff --git a/test/CodeGen/avx512ifmavl-builtins.c b/test/CodeGen/avx512ifmavl-builtins.c
index b38c1a8953ad..c59af0ec6d06 100644
--- a/test/CodeGen/avx512ifmavl-builtins.c
+++ b/test/CodeGen/avx512ifmavl-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s

#define __MM_MALLOC_H

diff --git a/test/CodeGen/avx512pf-builtins.c b/test/CodeGen/avx512pf-builtins.c
index 16b27e915136..19ee083eae2d 100644
--- a/test/CodeGen/avx512pf-builtins.c
+++ b/test/CodeGen/avx512pf-builtins.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H

#include <immintrin.h>

diff --git a/test/CodeGen/avx512vbmi-builtins.c b/test/CodeGen/avx512vbmi-builtins.c
index 3fe97fd536b9..0816bce3a6de 100644
--- a/test/CodeGen/avx512vbmi-builtins.c
+++ b/test/CodeGen/avx512vbmi-builtins.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H

#include <immintrin.h>

diff --git a/test/CodeGen/avx512vbmivl-builtin.c b/test/CodeGen/avx512vbmivl-builtin.c
index 549c9656c0de..b114720758aa 100644
--- a/test/CodeGen/avx512vbmivl-builtin.c
+++ b/test/CodeGen/avx512vbmivl-builtin.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H

#include <immintrin.h>

diff --git a/test/CodeGen/avx512vl-builtins.c b/test/CodeGen/avx512vl-builtins.c
index 2b6ec3733830..fe4ebe12ddd4 100644
--- a/test/CodeGen/avx512vl-builtins.c
+++ b/test/CodeGen/avx512vl-builtins.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
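The avx512vl hunks that follow also rename the integer-compare tests so the predicate is spelled with _MM_CMPINT_* instead of a bare immediate, while the CHECK lines switch to the generic icmp IR. A sketch of that pattern under the same RUN line; the helper below is hypothetical, not from the test file:

__mmask8 example_cmp_lt_epi32_mask(__m128i a, __m128i b) {
  // _MM_CMPINT_LT names the signed less-than predicate, so the builtin
  // lowers to a plain icmp slt on <4 x i32>; the old tests spelled the
  // same predicate as a magic 0/1 immediate.
  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
  return _mm_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
}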
-#define __MM_MALLOC_H #include <immintrin.h> @@ -501,56 +499,56 @@ __mmask8 test_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __ return (__mmask8)_mm256_mask_cmpneq_epu64_mask(__u, __a, __b); } -__mmask8 test_mm_cmp_epi32_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmp_epi32_mask +__mmask8 test_mm_cmp_eq_epi32_mask(__m128i __a, __m128i __b) { + // CHECK-LABEL: @test_mm_cmp_eq_epi32_mask // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}} - return (__mmask8)_mm_cmp_epi32_mask(__a, __b, 0); + return (__mmask8)_mm_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm_mask_cmp_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epi32_mask - // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}} +__mmask8 test_mm_mask_cmp_lt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + // CHECK-LABEL: @test_mm_mask_cmp_lt_epi32_mask + // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}} // CHECK: and <4 x i1> %{{.*}}, %{{.*}} - return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, 0); + return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_LT); } -__mmask8 test_mm_cmp_epi64_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmp_epi64_mask - // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}} - return (__mmask8)_mm_cmp_epi64_mask(__a, __b, 0); +__mmask8 test_mm_cmp_lt_epi64_mask(__m128i __a, __m128i __b) { + // CHECK-LABEL: @test_mm_cmp_lt_epi64_mask + // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}} + return (__mmask8)_mm_cmp_epi64_mask(__a, __b, _MM_CMPINT_LT); } -__mmask8 test_mm_mask_cmp_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epi64_mask +__mmask8 test_mm_mask_cmp_eq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + // CHECK-LABEL: @test_mm_mask_cmp_eq_epi64_mask // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}} // CHECK: and <2 x i1> %{{.*}}, %{{.*}} - return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, 0); + return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm256_cmp_epi32_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epi32_mask +__mmask8 test_mm256_cmp_eq_epi32_mask(__m256i __a, __m256i __b) { + // CHECK-LABEL: @test_mm256_cmp_eq_epi32_mask // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}} - return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, 0); + return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm256_mask_cmp_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmp_epi32_mask - // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}} +__mmask8 test_mm256_mask_cmp_le_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + // CHECK-LABEL: @test_mm256_mask_cmp_le_epi32_mask + // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} - return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, 0); + return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_LE); } -__mmask8 test_mm256_cmp_epi64_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epi64_mask +__mmask8 test_mm256_cmp_eq_epi64_mask(__m256i __a, __m256i __b) { + // CHECK-LABEL: @test_mm256_cmp_eq_epi64_mask // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}} - return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, 0); + return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, _MM_CMPINT_EQ); } -__mmask8 test_mm256_mask_cmp_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmp_epi64_mask +__mmask8 test_mm256_mask_cmp_eq_epi64_mask(__mmask8 
__u, __m256i __a, __m256i __b) { + // CHECK-LABEL: @test_mm256_mask_cmp_eq_epi64_mask // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}} // CHECK: and <4 x i1> %{{.*}}, %{{.*}} - return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, 0); + return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ); } __mmask8 test_mm_cmp_epu32_mask(__m128i __a, __m128i __b) { @@ -608,118 +606,136 @@ __mmask8 test_mm256_mask_cmp_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) __m256i test_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_mask_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.256 + //CHECK: add <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_add_epi32(__W, __U, __A, __B); } __m256i test_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_maskz_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.256 + //CHECK: add <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_add_epi32(__U, __A, __B); } __m256i test_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_mask_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.256 + //CHECK: add <4 x i64> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_add_epi64(__W,__U,__A,__B); } __m256i test_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_maskz_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.256 + //CHECK: add <4 x i64> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_add_epi64 (__U,__A,__B); } __m256i test_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_mask_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.256 + //CHECK: sub <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_sub_epi32 (__W,__U,__A,__B); } __m256i test_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_maskz_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.256 + //CHECK: sub <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_sub_epi32 (__U,__A,__B); } __m256i test_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_mask_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.256 + //CHECK: sub <4 x i64> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_sub_epi64 (__W,__U,__A,__B); } __m256i test_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_maskz_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.256 + //CHECK: sub <4 x i64> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_sub_epi64 (__U,__A,__B); } __m128i test_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_mask_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.128 + //CHECK: add <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_add_epi32(__W,__U,__A,__B); } __m128i test_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i 
__B) { //CHECK-LABEL: @test_mm_maskz_add_epi32 - //CHECK: @llvm.x86.avx512.mask.padd.d.128 + //CHECK: add <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_add_epi32 (__U,__A,__B); } __m128i test_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { -//CHECK-LABEL: @test_mm_mask_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.128 + //CHECK-LABEL: @test_mm_mask_add_epi64 + //CHECK: add <2 x i64> %{{.*}}, %{{.*}} + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_add_epi64 (__W,__U,__A,__B); } __m128i test_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_maskz_add_epi64 - //CHECK: @llvm.x86.avx512.mask.padd.q.128 + //CHECK: add <2 x i64> %{{.*}}, %{{.*}} + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_add_epi64 (__U,__A,__B); } __m128i test_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_mask_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.128 + //CHECK: sub <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_sub_epi32(__W, __U, __A, __B); } __m128i test_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_maskz_sub_epi32 - //CHECK: @llvm.x86.avx512.mask.psub.d.128 + //CHECK: sub <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_sub_epi32(__U, __A, __B); } __m128i test_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_mask_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.128 + //CHECK: sub <2 x i64> %{{.*}}, %{{.*}} + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_sub_epi64 (__W, __U, __A, __B); } __m128i test_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_maskz_sub_epi64 - //CHECK: @llvm.x86.avx512.mask.psub.q.128 + //CHECK: sub <2 x i64> %{{.*}}, %{{.*}} + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_sub_epi64 (__U, __A, __B); } __m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: @test_mm256_mask_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.256 + //CHECK: @llvm.x86.avx2.pmul.dq + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epi32(__W, __M, __X, __Y); } __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: @test_mm256_maskz_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.256 + //CHECK: @llvm.x86.avx2.pmul.dq + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epi32(__M, __X, __Y); } @@ -727,65 +743,75 @@ __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: @test_mm_mask_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.128 + //CHECK: @llvm.x86.sse41.pmuldq + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epi32(__W, __M, __X, __Y); } __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: @test_mm_maskz_mul_epi32 - //CHECK: @llvm.x86.avx512.mask.pmul.dq.128 + //CHECK: @llvm.x86.sse41.pmuldq + //CHECK: select <2 x i1> 
%{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epi32(__M, __X, __Y); } __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: @test_mm256_mask_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.256 + //CHECK: @llvm.x86.avx2.pmulu.dq + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epu32(__W, __M, __X, __Y); } __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: @test_mm256_maskz_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.256 + //CHECK: @llvm.x86.avx2.pmulu.dq + //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epu32(__M, __X, __Y); } __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: @test_mm_mask_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.128 + //CHECK: @llvm.x86.sse2.pmulu.dq + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epu32(__W, __M, __X, __Y); } __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: @test_mm_maskz_mul_epu32 - //CHECK: @llvm.x86.avx512.mask.pmulu.dq.128 + //CHECK: @llvm.x86.sse2.pmulu.dq + //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epu32(__M, __X, __Y); } __m128i test_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_maskz_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.128 + //CHECK: mul <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_mullo_epi32(__M, __A, __B); } __m128i test_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { //CHECK-LABEL: @test_mm_mask_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.128 + //CHECK: mul <4 x i32> %{{.*}}, %{{.*}} + //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_mullo_epi32(__W, __M, __A, __B); } __m256i test_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_maskz_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.256 + //CHECK: mul <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_mullo_epi32(__M, __A, __B); } __m256i test_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { //CHECK-LABEL: @test_mm256_mask_mullo_epi32 - //CHECK: @llvm.x86.avx512.mask.pmull.d.256 + //CHECK: mul <8 x i32> %{{.*}}, %{{.*}} + //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_mullo_epi32(__W, __M, __A, __B); } @@ -1503,42 +1529,50 @@ __m256 test_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 _ __m128d test_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.128 + // CHECK: fadd <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_add_pd(__W,__U,__A,__B); } __m128d test_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.128 + // CHECK: fadd <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_add_pd(__U,__A,__B); } __m256d test_mm256_mask_add_pd(__m256d __W, 
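// The mul_epi32/mul_epu32 tests above keep a target intrinsic
// (@llvm.x86.sse41.pmuldq / @llvm.x86.avx2.pmul.dq and the unsigned
// variants); only the masking moves to a generic select. As a reminder of
// what pmuldq itself computes, a scalar model (hypothetical helper; each
// 64-bit lane is the full product of the low 32 bits of the source lanes):
static inline void model_mm_mul_epi32(long long dst[2],
                                      const long long a[2],
                                      const long long b[2]) {
  for (int i = 0; i < 2; ++i) {
    long long lo_a = (int)a[i]; // sign-extend the low 32 bits of the lane
    long long lo_b = (int)b[i];
    dst[i] = lo_a * lo_b;       // 64-bit signed product, no truncation
  }
}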
__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.256 + // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_add_pd(__W,__U,__A,__B); } __m256d test_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_add_pd - // CHECK: @llvm.x86.avx512.mask.add.pd.256 + // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_add_pd(__U,__A,__B); } -__m128 test_mm_mask_add_ps(__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { +__m128 test_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.128 + // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_add_ps(__W,__U,__A,__B); } __m128 test_mm_maskz_add_ps(__mmask16 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.128 + // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_add_ps(__U,__A,__B); } -__m256 test_mm256_mask_add_ps(__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { +__m256 test_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.256 + // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_add_ps(__W,__U,__A,__B); } -__m256 test_mm256_maskz_add_ps(__mmask16 __U, __m256 __A, __m256 __B) { +__m256 test_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_add_ps - // CHECK: @llvm.x86.avx512.mask.add.ps.256 + // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_add_ps(__U,__A,__B); } __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { @@ -1703,23 +1737,29 @@ void test_mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m256i __A) } __m128d test_mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.128 - return _mm_mask_cvtepi32_pd(__W,__U,__A); + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1> + // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double> + // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}} + return _mm_mask_cvtepi32_pd(__W,__U,__A); } __m128d test_mm_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.128 - return _mm_maskz_cvtepi32_pd(__U,__A); + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1> + // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double> + // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}} + return _mm_maskz_cvtepi32_pd(__U,__A); } __m256d test_mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.256 - return _mm256_mask_cvtepi32_pd(__W,__U,__A); + // CHECK: sitofp <4 x i32> %{{.*}} to <4 x 
double> + // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}} + return _mm256_mask_cvtepi32_pd(__W,__U,__A); } __m256d test_mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepi32_pd - // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.256 - return _mm256_maskz_cvtepi32_pd(__U,__A); + // CHECK: sitofp <4 x i32> %{{.*}} to <4 x double> + // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}} + return _mm256_maskz_cvtepi32_pd(__U,__A); } __m128 test_mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepi32_ps @@ -1983,33 +2023,40 @@ __m256i test_mm256_maskz_cvttps_epu32(__mmask8 __U, __m256 __A) { } __m128d test_mm_cvtepu32_pd(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.128 - return _mm_cvtepu32_pd(__A); + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1> + // CHECK: uitofp <2 x i32> %{{.*}} to <2 x double> + return _mm_cvtepu32_pd(__A); } __m128d test_mm_mask_cvtepu32_pd(__m128d __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.128 - return _mm_mask_cvtepu32_pd(__W,__U,__A); + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1> + // CHECK: uitofp <2 x i32> %{{.*}} to <2 x double> + // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}} + return _mm_mask_cvtepu32_pd(__W,__U,__A); } __m128d test_mm_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.128 - return _mm_maskz_cvtepu32_pd(__U,__A); + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1> + // CHECK: uitofp <2 x i32> %{{.*}} to <2 x double> + // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}} + return _mm_maskz_cvtepu32_pd(__U,__A); } __m256d test_mm256_cvtepu32_pd(__m128i __A) { // CHECK-LABEL: @test_mm256_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.256 - return _mm256_cvtepu32_pd(__A); + // CHECK: uitofp <4 x i32> %{{.*}} to <4 x double> + return _mm256_cvtepu32_pd(__A); } __m256d test_mm256_mask_cvtepu32_pd(__m256d __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.256 - return _mm256_mask_cvtepu32_pd(__W,__U,__A); + // CHECK: uitofp <4 x i32> %{{.*}} to <4 x double> + // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}} + return _mm256_mask_cvtepu32_pd(__W,__U,__A); } __m256d test_mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepu32_pd - // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.256 - return _mm256_maskz_cvtepu32_pd(__U,__A); + // CHECK: uitofp <4 x i32> %{{.*}} to <4 x double> + // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}} + return _mm256_maskz_cvtepu32_pd(__U,__A); } __m128 test_mm_cvtepu32_ps(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepu32_ps @@ -2043,42 +2090,50 @@ __m256 test_mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { } __m128d test_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.128 + // CHECK: fdiv <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_div_pd(__W,__U,__A,__B); } __m128d 
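// The cvtepi32_pd/cvtepu32_pd tests above show the other common rewrite:
// conversions now lower to plain sitofp/uitofp (for the 128-bit forms, a
// shufflevector first narrows to the two low lanes), with masking again a
// trailing select. A scalar model of _mm_cvtepu32_pd (hypothetical helper):
static inline void model_mm_cvtepu32_pd(double dst[2], const unsigned src[4]) {
  for (int i = 0; i < 2; ++i)   // shufflevector ... <2 x i32> <i32 0, i32 1>
    dst[i] = (double)src[i];    // uitofp <2 x i32> to <2 x double>
}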
test_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.128 + // CHECK: fdiv <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_div_pd(__U,__A,__B); } __m256d test_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.256 + // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_div_pd(__W,__U,__A,__B); } __m256d test_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_div_pd - // CHECK: @llvm.x86.avx512.mask.div.pd.256 + // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_div_pd(__U,__A,__B); } __m128 test_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.128 + // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_div_ps(__W,__U,__A,__B); } -__m128 test_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__m128 test_mm_maskz_div_ps(__mmask16 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.128 + // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_div_ps(__U,__A,__B); } __m256 test_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.256 + // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_div_ps(__W,__U,__A,__B); } __m256 test_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_div_ps - // CHECK: @llvm.x86.avx512.mask.div.ps.256 + // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_div_ps(__U,__A,__B); } __m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { @@ -2303,142 +2358,170 @@ __m256 test_mm256_maskz_getexp_ps(__mmask8 __U, __m256 __A) { } __m128d test_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_max_pd - // CHECK: @llvm.x86.avx512.mask.max.pd + // CHECK: @llvm.x86.sse2.max.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_max_pd(__W,__U,__A,__B); } __m128d test_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_max_pd - // CHECK: @llvm.x86.avx512.mask.max.pd + // CHECK: @llvm.x86.sse2.max.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_max_pd(__U,__A,__B); } __m256d test_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_max_pd - // CHECK: @llvm.x86.avx512.mask.max.pd.256 + // CHECK: @llvm.x86.avx.max.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_max_pd(__W,__U,__A,__B); } __m256d test_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { 
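// For max/min the new codegen keeps the pre-AVX-512 intrinsic
// (@llvm.x86.sse2.max.pd, @llvm.x86.avx.max.pd.256, ...) and bolts the mask
// on as a select. Scalar sketch of the masked form (hypothetical helper;
// the ternary mirrors maxpd, which returns the second operand whenever the
// greater-than compare fails, including the NaN and signed-zero cases):
static inline void model_mask_max_pd(double dst[2], const double src[2],
                                     unsigned char mask,
                                     const double a[2], const double b[2]) {
  for (int i = 0; i < 2; ++i) {
    double m = (a[i] > b[i]) ? a[i] : b[i];  // @llvm.x86.sse2.max.pd
    dst[i] = ((mask >> i) & 1) ? m : src[i]; // select <2 x i1>
  }
}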
// CHECK-LABEL: @test_mm256_maskz_max_pd - // CHECK: @llvm.x86.avx512.mask.max.pd.256 + // CHECK: @llvm.x86.avx.max.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_max_pd(__U,__A,__B); } __m128 test_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_max_ps - // CHECK: @llvm.x86.avx512.mask.max.ps + // CHECK: @llvm.x86.sse.max.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_max_ps(__W,__U,__A,__B); } __m128 test_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_max_ps - // CHECK: @llvm.x86.avx512.mask.max.ps + // CHECK: @llvm.x86.sse.max.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_max_ps(__U,__A,__B); } __m256 test_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_max_ps - // CHECK: @llvm.x86.avx512.mask.max.ps.256 + // CHECK: @llvm.x86.avx.max.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_max_ps(__W,__U,__A,__B); } __m256 test_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_max_ps - // CHECK: @llvm.x86.avx512.mask.max.ps.256 + // CHECK: @llvm.x86.avx.max.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_max_ps(__U,__A,__B); } __m128d test_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_min_pd - // CHECK: @llvm.x86.avx512.mask.min.pd + // CHECK: @llvm.x86.sse2.min.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_min_pd(__W,__U,__A,__B); } __m128d test_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_min_pd - // CHECK: @llvm.x86.avx512.mask.min.pd + // CHECK: @llvm.x86.sse2.min.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_min_pd(__U,__A,__B); } __m256d test_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_min_pd - // CHECK: @llvm.x86.avx512.mask.min.pd.256 + // CHECK: @llvm.x86.avx.min.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_min_pd(__W,__U,__A,__B); } __m256d test_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_min_pd - // CHECK: @llvm.x86.avx512.mask.min.pd.256 + // CHECK: @llvm.x86.avx.min.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_min_pd(__U,__A,__B); } __m128 test_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_min_ps - // CHECK: @llvm.x86.avx512.mask.min.ps + // CHECK: @llvm.x86.sse.min.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_min_ps(__W,__U,__A,__B); } __m128 test_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_min_ps - // CHECK: @llvm.x86.avx512.mask.min.ps + // CHECK: @llvm.x86.sse.min.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_min_ps(__U,__A,__B); } __m256 test_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_min_ps - // CHECK: @llvm.x86.avx512.mask.min.ps.256 + // 
CHECK: @llvm.x86.avx.min.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_min_ps(__W,__U,__A,__B); } __m256 test_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_min_ps - // CHECK: @llvm.x86.avx512.mask.min.ps.256 + // CHECK: @llvm.x86.avx.min.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_min_ps(__U,__A,__B); } __m128d test_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd + // CHECK: fmul <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_mul_pd(__W,__U,__A,__B); } __m128d test_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd + // CHECK: fmul <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_mul_pd(__U,__A,__B); } __m256d test_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd.256 + // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_mul_pd(__W,__U,__A,__B); } __m256d test_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_mul_pd - // CHECK: @llvm.x86.avx512.mask.mul.pd.256 + // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_mul_pd(__U,__A,__B); } __m128 test_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps + // CHECK: fmul <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_mul_ps(__W,__U,__A,__B); } __m128 test_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps + // CHECK: fmul <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_mul_ps(__U,__A,__B); } __m256 test_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps.256 + // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_mul_ps(__W,__U,__A,__B); } __m256 test_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_mul_ps - // CHECK: @llvm.x86.avx512.mask.mul.ps.256 + // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_mul_ps(__U,__A,__B); } __m128i test_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_abs_epi32 - // CHECK: @llvm.x86.avx512.mask.pabs.d.128 + // CHECK: @llvm.x86.ssse3.pabs.d.128 + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_abs_epi32(__W,__U,__A); } __m128i test_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_abs_epi32 - // CHECK: @llvm.x86.avx512.mask.pabs.d.128 + // 
CHECK: @llvm.x86.ssse3.pabs.d.128 + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_abs_epi32(__U,__A); } __m256i test_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: @test_mm256_mask_abs_epi32 - // CHECK: @llvm.x86.avx512.mask.pabs.d.256 + // CHECK: @llvm.x86.avx2.pabs.d + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_abs_epi32(__W,__U,__A); } __m256i test_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: @test_mm256_maskz_abs_epi32 - // CHECK: @llvm.x86.avx512.mask.pabs.d.256 + // CHECK: @llvm.x86.avx2.pabs.d + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_abs_epi32(__U,__A); } __m128i test_mm_abs_epi64(__m128i __A) { @@ -2473,202 +2556,274 @@ __m256i test_mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) { } __m128i test_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.128 + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_maskz_max_epi32(__M,__A,__B); } __m128i test_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.128 + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_mask_max_epi32(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.256 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_maskz_max_epi32(__M,__A,__B); } __m256i test_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epi32 - // CHECK: @llvm.x86.avx512.mask.pmaxs.d.256 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_mask_max_epi32(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.128 + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_maskz_max_epi64(__M,__A,__B); } __m128i test_mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.128 + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_mask_max_epi64(__W,__M,__A,__B); } __m128i 
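// max_epi64/max_epu64 have no pre-AVX-512 instruction at these widths, so
// the new codegen expands them to the generic compare+select sequence the
// CHECK lines above spell out. Scalar model (hypothetical helper):
static inline void model_mm_max_epi64(long long dst[2],
                                      const long long a[2],
                                      const long long b[2]) {
  for (int i = 0; i < 2; ++i)
    dst[i] = (a[i] > b[i]) ? a[i] : b[i]; // icmp sgt + select <2 x i1>
}
// The epu64 forms are identical with an unsigned compare (icmp ugt), and the
// min variants flip the predicate to slt/ult.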
test_mm_max_epi64(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.128 + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] return _mm_max_epi64(__A,__B); } __m256i test_mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.256 + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_maskz_max_epi64(__M,__A,__B); } __m256i test_mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.256 + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_mask_max_epi64(__W,__M,__A,__B); } __m256i test_mm256_max_epi64(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_max_epi64 - // CHECK: @llvm.x86.avx512.mask.pmaxs.q.256 + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] return _mm256_max_epi64(__A,__B); } __m128i test_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.128 + // CHECK: [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_maskz_max_epu32(__M,__A,__B); } __m128i test_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.128 + // CHECK: [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_mask_max_epu32(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.256 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_maskz_max_epu32(__M,__A,__B); } __m256i test_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epu32 - // CHECK: @llvm.x86.avx512.mask.pmaxu.d.256 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_mask_max_epu32(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.128 + // CHECK: [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: 
[[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_maskz_max_epu64(__M,__A,__B); } __m128i test_mm_max_epu64(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.128 + // CHECK: [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] return _mm_max_epu64(__A,__B); } __m128i test_mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.128 + // CHECK: [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_mask_max_epu64(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.256 + // CHECK: [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_maskz_max_epu64(__M,__A,__B); } __m256i test_mm256_max_epu64(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.256 + // CHECK: [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] return _mm256_max_epu64(__A,__B); } __m256i test_mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epu64 - // CHECK: @llvm.x86.avx512.mask.pmaxu.q.256 + // CHECK: [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_mask_max_epu64(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.128 + // CHECK: [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_maskz_min_epi32(__M,__A,__B); } __m128i test_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.128 + // CHECK: [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_mask_min_epi32(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.256 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_maskz_min_epi32(__M,__A,__B); } __m256i test_mm256_mask_min_epi32(__m256i __W, __mmask8 
__M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epi32 - // CHECK: @llvm.x86.avx512.mask.pmins.d.256 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_mask_min_epi32(__W,__M,__A,__B); } __m128i test_mm_min_epi64(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.128 + // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] return _mm_min_epi64(__A,__B); } __m128i test_mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.128 + // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_mask_min_epi64(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.128 + // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_maskz_min_epi64(__M,__A,__B); } __m256i test_mm256_min_epi64(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.256 + // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] return _mm256_min_epi64(__A,__B); } __m256i test_mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.256 + // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_mask_min_epi64(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epi64 - // CHECK: @llvm.x86.avx512.mask.pmins.q.256 + // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_maskz_min_epi64(__M,__A,__B); } __m128i test_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.128 + // CHECK: [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_maskz_min_epu32(__M,__A,__B); } __m128i test_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.128 + // CHECK: [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x 
i32> [[X]], <4 x i32> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}} return _mm_mask_min_epu32(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.256 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_maskz_min_epu32(__M,__A,__B); } __m256i test_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epu32 - // CHECK: @llvm.x86.avx512.mask.pminu.d.256 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}} return _mm256_mask_min_epu32(__W,__M,__A,__B); } __m128i test_mm_min_epu64(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.128 + // CHECK: [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] return _mm_min_epu64(__A,__B); } __m128i test_mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.128 + // CHECK: [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_mask_min_epu64(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.128 + // CHECK: [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]] + // CHECK: select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}} return _mm_maskz_min_epu64(__M,__A,__B); } __m256i test_mm256_min_epu64(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.256 + // CHECK: [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] return _mm256_min_epu64(__A,__B); } __m256i test_mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.256 + // CHECK: [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_mask_min_epu64(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epu64 - // CHECK: @llvm.x86.avx512.mask.pminu.q.256 + // CHECK: [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]] + // CHECK: select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}} return _mm256_maskz_min_epu64(__M,__A,__B); } __m128d test_mm_roundscale_pd(__m128d __A) { @@ -2953,82 +3108,98 @@ void 
test_mm256_mask_i32scatter_epi32(int *__addr, __mmask8 __mask, __m256i __i } __m128d test_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: @test_mm_mask_sqrt_pd - // CHECK: @llvm.x86.avx512.mask.sqrt.pd.128 + // CHECK: @llvm.x86.sse2.sqrt.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_sqrt_pd(__W,__U,__A); } __m128d test_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { // CHECK-LABEL: @test_mm_maskz_sqrt_pd - // CHECK: @llvm.x86.avx512.mask.sqrt.pd.128 + // CHECK: @llvm.x86.sse2.sqrt.pd + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_sqrt_pd(__U,__A); } __m256d test_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { // CHECK-LABEL: @test_mm256_mask_sqrt_pd - // CHECK: @llvm.x86.avx512.mask.sqrt.pd.256 + // CHECK: @llvm.x86.avx.sqrt.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_sqrt_pd(__W,__U,__A); } __m256d test_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { // CHECK-LABEL: @test_mm256_maskz_sqrt_pd - // CHECK: @llvm.x86.avx512.mask.sqrt.pd.256 + // CHECK: @llvm.x86.avx.sqrt.pd.256 + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_sqrt_pd(__U,__A); } __m128 test_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { // CHECK-LABEL: @test_mm_mask_sqrt_ps - // CHECK: @llvm.x86.avx512.mask.sqrt.ps.128 + // CHECK: @llvm.x86.sse.sqrt.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_sqrt_ps(__W,__U,__A); } __m128 test_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: @test_mm_maskz_sqrt_ps - // CHECK: @llvm.x86.avx512.mask.sqrt.ps.128 + // CHECK: @llvm.x86.sse.sqrt.ps + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_sqrt_ps(__U,__A); } __m256 test_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm256_mask_sqrt_ps - // CHECK: @llvm.x86.avx512.mask.sqrt.ps.256 + // CHECK: @llvm.x86.avx.sqrt.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_sqrt_ps(__W,__U,__A); } __m256 test_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: @test_mm256_maskz_sqrt_ps - // CHECK: @llvm.x86.avx512.mask.sqrt.ps.256 + // CHECK: @llvm.x86.avx.sqrt.ps.256 + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_sqrt_ps(__U,__A); } __m128d test_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.128 + // CHECK: fsub <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_sub_pd(__W,__U,__A,__B); } __m128d test_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.128 + // CHECK: fsub <2 x double> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_sub_pd(__U,__A,__B); } __m256d test_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.256 + // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_sub_pd(__W,__U,__A,__B); } __m256d 
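// sqrt follows the max/min shape: keep @llvm.x86.sse2.sqrt.pd (or the
// AVX 256-bit variant) and mask the result with a select. Scalar model
// (hypothetical helper; __builtin_sqrt stands in for the hardware root):
static inline void model_mask_sqrt_pd(double dst[2], const double src[2],
                                      unsigned char mask, const double a[2]) {
  for (int i = 0; i < 2; ++i) {
    double r = __builtin_sqrt(a[i]);         // @llvm.x86.sse2.sqrt.pd
    dst[i] = ((mask >> i) & 1) ? r : src[i]; // select <2 x i1>
  }
}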
test_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_sub_pd - // CHECK: @llvm.x86.avx512.mask.sub.pd.256 + // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_sub_pd(__U,__A,__B); } -__m128 test_mm_mask_sub_ps(__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { +__m128 test_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.128 + // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_sub_ps(__W,__U,__A,__B); } -__m128 test_mm_maskz_sub_ps(__mmask16 __U, __m128 __A, __m128 __B) { +__m128 test_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.128 + // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_sub_ps(__U,__A,__B); } -__m256 test_mm256_mask_sub_ps(__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { +__m256 test_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.256 + // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_sub_ps(__W,__U,__A,__B); } -__m256 test_mm256_maskz_sub_ps(__mmask16 __U, __m256 __A, __m256 __B) { +__m256 test_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_sub_ps - // CHECK: @llvm.x86.avx512.mask.sub.ps.256 + // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_sub_ps(__U,__A,__B); } __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { @@ -3194,241 +3365,281 @@ __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i _ __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128 + // CHECK: sext <4 x i8> %{{.*}} to <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_cvtepi8_epi32(__W, __U, __A); } __m128i test_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128 + // CHECK: sext <4 x i8> %{{.*}} to <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_cvtepi8_epi32(__U, __A); } __m256i test_mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_cvtepi8_epi32(__W, __U, __A); } __m256i test_mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_cvtepi8_epi32(__U, __A); } __m128i test_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i 
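// The cvtepi8_epi32 family above now lowers to a plain IR sext of the lanes
// actually used, plus the usual mask select. Scalar model of the masked
// 128-bit form (hypothetical helper; four lanes from the low source bytes):
static inline void model_mask_cvtepi8_epi32(int dst[4], const int src[4],
                                            unsigned char mask,
                                            const signed char a[16]) {
  for (int i = 0; i < 4; ++i) {
    int ext = a[i];                            // sext <4 x i8> to <4 x i32>
    dst[i] = ((mask >> i) & 1) ? ext : src[i]; // select <4 x i1>
  }
}
// The cvtepu* tests below differ only in using zext (unsigned source lanes).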
__A) { // CHECK-LABEL: @test_mm_mask_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128 + // CHECK: sext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_cvtepi8_epi64(__W, __U, __A); } __m128i test_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128 + // CHECK: sext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_cvtepi8_epi64(__U, __A); } __m256i test_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256 + // CHECK: sext <4 x i8> %{{.*}} to <4 x i64> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_cvtepi8_epi64(__W, __U, __A); } __m256i test_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256 + // CHECK: sext <4 x i8> %{{.*}} to <4 x i64> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_cvtepi8_epi64(__U, __A); } __m128i test_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { // CHECK-LABEL: @test_mm_mask_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128 + // CHECK: sext <2 x i32> %{{.*}} to <2 x i64> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_cvtepi32_epi64(__W, __U, __X); } __m128i test_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128 + // CHECK: sext <2 x i32> %{{.*}} to <2 x i64> + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_cvtepi32_epi64(__U, __X); } __m256i test_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256 + // CHECK: sext <4 x i32> %{{.*}} to <4 x i64> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_cvtepi32_epi64(__W, __U, __X); } __m256i test_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256 + // CHECK: sext <4 x i32> %{{.*}} to <4 x i64> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_cvtepi32_epi64(__U, __X); } __m128i test_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128 + // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_cvtepi16_epi32(__W, __U, __A); } __m128i test_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128 + // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_cvtepi16_epi32(__U, __A); } __m256i test_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256 + // CHECK: sext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: select <8 x i1> 
%{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_cvtepi16_epi32(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256
+  // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_cvtepi16_epi32(__U, __A);
}

__m128i test_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_cvtepi16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  // CHECK: sext <2 x i16> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_cvtepi16_epi64(__W, __U, __A);
}

__m128i test_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  // CHECK: sext <2 x i16> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_cvtepi16_epi64(__U, __A);
}

__m256i test_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  // CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_cvtepi16_epi64(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  // CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_cvtepi16_epi64(__U, __A);
}

__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_cvtepu8_epi32(__W, __U, __A);
}

__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_cvtepu8_epi32(__U, __A);
}

__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  // CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_cvtepu8_epi32(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  // CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_cvtepu8_epi32(__U, __A);
}

__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  // CHECK: zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_cvtepu8_epi64(__W, __U, __A);
}

__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  // CHECK: zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_cvtepu8_epi64(__U, __A);
}

__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_cvtepu8_epi64(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_cvtepu8_epi64(__U, __A);
}

__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
  // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_cvtepu32_epi64(__W, __U, __X);
}

__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
  // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_cvtepu32_epi64(__U, __X);
}

__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
  // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_cvtepu32_epi64(__W, __U, __X);
}

__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_cvtepu32_epi64(__U, __X);
}

__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_cvtepu16_epi32(__W, __U, __A);
}

__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_cvtepu16_epi32(__U, __A);
}

__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  // CHECK: zext <8 x i16> %{{.*}} to <8 x i32>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_cvtepu16_epi32(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  // CHECK: zext <8 x i16> %{{.*}} to <8 x i32>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_cvtepu16_epi32(__U, __A);
}

__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  // CHECK: zext <2 x i16> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_cvtepu16_epi64(__W, __U, __A);
}

__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  // CHECK: zext <2 x i16> %{{.*}} to <2 x i64>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_cvtepu16_epi64(__U, __A);
}

__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_cvtepu16_epi64(__W, __U, __A);
}

__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64
-  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_cvtepu16_epi64(__U, __A);
}
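The same shape repeats through every conversion test above: rather than matching an opaque @llvm.x86.avx512.mask.* call, each test now expects a generic IR sext/zext followed by a per-lane select on the mask. A minimal scalar sketch of what that pair of CHECK lines means for _mm256_maskz_cvtepi16_epi32 (the helper name and driver are illustrative, not part of the test file):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the masked sign-extending conversion: each lane is
   sign-extended (the `sext <8 x i16> ... to <8 x i32>` line), then a
   per-lane select on the mask bits zeroes the disabled lanes. */
static void maskz_cvtepi16_epi32(int32_t dst[8], const int16_t src[8],
                                 uint8_t mask) {
  for (int i = 0; i < 8; ++i)
    dst[i] = ((mask >> i) & 1) ? (int32_t)src[i] : 0;
}

int main(void) {
  const int16_t src[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  int32_t dst[8];
  maskz_cvtepi16_epi32(dst, src, 0x55); /* keep even lanes only */
  for (int i = 0; i < 8; ++i)
    printf("%d ", (int)dst[i]);         /* prints: -1 0 -3 0 -5 0 -7 0 */
  putchar('\n');
  return 0;
}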
@@ -3723,253 +3934,405 @@ __m256i test_mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {

__m128i test_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_sllv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_sllv_epi64(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_sllv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_sllv_epi64(__U, __X, __Y);
}

__m256i test_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_sllv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_sllv_epi64(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_sllv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_sllv_epi64(__U, __X, __Y);
}

__m128i test_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_sllv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_sllv_epi32(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_sllv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_sllv_epi32(__U, __X, __Y);
}

__m256i test_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_sllv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_sllv_epi32(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_sllv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psllv
+  // CHECK: @llvm.x86.avx2.psllv.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_sllv_epi32(__U, __X, __Y);
}

__m128i test_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_srlv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_srlv_epi64(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_srlv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_srlv_epi64(__U, __X, __Y);
}

__m256i test_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_srlv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_srlv_epi64(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_srlv_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_srlv_epi64(__U, __X, __Y);
}

__m128i test_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_srlv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_srlv_epi32(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_srlv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_srlv_epi32(__U, __X, __Y);
}

__m256i test_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_srlv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_srlv_epi32(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_srlv_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrlv
+  // CHECK: @llvm.x86.avx2.psrlv.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_srlv_epi32(__U, __X, __Y);
}

__m128i test_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_srl_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  // CHECK: @llvm.x86.sse2.psrl.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_srl_epi32(__W, __U, __A, __B);
}

__m128i test_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_srl_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  // CHECK: @llvm.x86.sse2.psrl.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_srl_epi32(__U, __A, __B);
}

__m256i test_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_srl_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  // CHECK: @llvm.x86.avx2.psrl.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_srl_epi32(__W, __U, __A, __B);
}

__m256i test_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_srl_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  // CHECK: @llvm.x86.avx2.psrl.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_srl_epi32(__U, __A, __B);
}

__m128i test_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_srli_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  // CHECK: @llvm.x86.sse2.psrli.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_srli_epi32(__W, __U, __A, 5);
}

__m128i test_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_srli_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  // CHECK: @llvm.x86.sse2.psrli.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_srli_epi32(__U, __A, 5);
}

__m256i test_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_srli_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  // CHECK: @llvm.x86.avx2.psrli.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_srli_epi32(__W, __U, __A, 5);
}

__m256i test_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_srli_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  // CHECK: @llvm.x86.avx2.psrli.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_srli_epi32(__U, __A, 5);
}

__m128i test_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_srl_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  // CHECK: @llvm.x86.sse2.psrl.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_srl_epi64(__W, __U, __A, __B);
}

__m128i test_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_srl_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  // CHECK: @llvm.x86.sse2.psrl.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_srl_epi64(__U, __A, __B);
}

__m256i test_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_srl_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  // CHECK: @llvm.x86.avx2.psrl.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_srl_epi64(__W, __U, __A, __B);
}

__m256i test_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_srl_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  // CHECK: @llvm.x86.avx2.psrl.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_srl_epi64(__U, __A, __B);
}

__m128i test_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_srli_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  // CHECK: @llvm.x86.sse2.psrli.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_srli_epi64(__W, __U, __A, 5);
}

__m128i test_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_srli_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  // CHECK: @llvm.x86.sse2.psrli.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_srli_epi64(__U, __A, 5);
}

__m256i test_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_srli_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  // CHECK: @llvm.x86.avx2.psrli.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_srli_epi64(__W, __U, __A, 5);
}

__m256i test_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_srli_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  // CHECK: @llvm.x86.avx2.psrli.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_srli_epi64(__U, __A, 5);
}

+__m128i test_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sll_epi32
+  // CHECK: @llvm.x86.sse2.psll.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_sll_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sll_epi32
+  // CHECK: @llvm.x86.sse2.psll.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_sll_epi32(__U, __A, __B);
+}
+
+__m256i test_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sll_epi32
+  // CHECK: @llvm.x86.avx2.psll.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_sll_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sll_epi32
+  // CHECK: @llvm.x86.avx2.psll.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_sll_epi32(__U, __A, __B);
+}
+
+__m128i test_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_slli_epi32
+  // CHECK: @llvm.x86.sse2.pslli.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_slli_epi32(__W, __U, __A, 5);
+}
+
+__m128i test_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_slli_epi32
+  // CHECK: @llvm.x86.sse2.pslli.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_slli_epi32(__U, __A, 5);
+}
+
+__m256i test_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_slli_epi32
+  // CHECK: @llvm.x86.avx2.pslli.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_slli_epi32(__W, __U, __A, 5);
+}
+
+__m256i test_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_slli_epi32
+  // CHECK: @llvm.x86.avx2.pslli.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_slli_epi32(__U, __A, 5);
+}
+
+__m128i test_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sll_epi64
+  // CHECK: @llvm.x86.sse2.psll.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_sll_epi64(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sll_epi64
+  // CHECK: @llvm.x86.sse2.psll.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_sll_epi64(__U, __A, __B);
+}
+
+__m256i test_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sll_epi64
+  // CHECK: @llvm.x86.avx2.psll.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_sll_epi64(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sll_epi64
+  // CHECK: @llvm.x86.avx2.psll.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_sll_epi64(__U, __A, __B);
+}
+
+__m128i test_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_slli_epi64
+  // CHECK: @llvm.x86.sse2.pslli.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_slli_epi64(__W, __U, __A, 5);
+}
+
+__m128i test_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_slli_epi64
+  // CHECK: @llvm.x86.sse2.pslli.q
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_slli_epi64(__U, __A, 5);
+}
+
+__m256i test_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_slli_epi64
+  // CHECK: @llvm.x86.avx2.pslli.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_slli_epi64(__W, __U, __A, 5);
+}
+
+__m256i test_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_slli_epi64
+  // CHECK: @llvm.x86.avx2.pslli.q
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_slli_epi64(__U, __A, 5);
+}
+
__m128i test_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_srav_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrav
+  // CHECK: @llvm.x86.avx2.psrav.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_srav_epi32(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_srav_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrav
+  // CHECK: @llvm.x86.avx2.psrav.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_srav_epi32(__U, __X, __Y);
}

__m256i test_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_srav_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrav
+  // CHECK: @llvm.x86.avx2.psrav.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_srav_epi32(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_srav_epi32
-  // CHECK: @llvm.x86.avx512.mask.psrav
+  // CHECK: @llvm.x86.avx2.psrav.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_srav_epi32(__U, __X, __Y);
}
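All of the shift tests above share one shape: the IR now calls the pre-AVX-512 vector intrinsic (SSE2/AVX2 psll, psrl, psllv, psrlv, psrav) and only the final mask blend is new. A scalar sketch of that merge-masked shift, assuming the documented psll.d behavior of zeroing a lane once the count exceeds the element width (the helper name and driver are illustrative):

#include <stdint.h>

/* Scalar model of _mm_mask_sll_epi32: compute the full unmasked shift
   first, then select per lane between the shifted value and the old
   (passthrough) value according to the mask bits. */
static void mask_sll_epi32(int32_t dst[4], const int32_t old[4],
                           uint8_t mask, const int32_t a[4],
                           uint64_t count) {
  for (int i = 0; i < 4; ++i) {
    uint32_t shifted = count > 31 ? 0u : (uint32_t)a[i] << count;
    dst[i] = ((mask >> i) & 1) ? (int32_t)shifted : old[i];
  }
}

int main(void) {
  const int32_t a[4] = {1, 2, 3, 4}, old[4] = {9, 9, 9, 9};
  int32_t dst[4];
  mask_sll_epi32(dst, old, 0x5, a, 1); /* lanes 0,2 shifted; 1,3 keep 9 */
  return (dst[0] == 2 && dst[1] == 9) ? 0 : 1;
}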
__m128i test_mm_srav_epi64(__m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  // CHECK: @llvm.x86.avx512.psrav.q.128
  return _mm_srav_epi64(__X, __Y);
}

__m128i test_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_mask_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  // CHECK: @llvm.x86.avx512.psrav.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_srav_epi64(__W, __U, __X, __Y);
}

__m128i test_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
  // CHECK-LABEL: @test_mm_maskz_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  // CHECK: @llvm.x86.avx512.psrav.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_srav_epi64(__U, __X, __Y);
}

__m256i test_mm256_srav_epi64(__m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  // CHECK: @llvm.x86.avx512.psrav.q.256
  return _mm256_srav_epi64(__X, __Y);
}

__m256i test_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_mask_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  // CHECK: @llvm.x86.avx512.psrav.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_srav_epi64(__W, __U, __X, __Y);
}

__m256i test_mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
  // CHECK-LABEL: @test_mm256_maskz_srav_epi64
-  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  // CHECK: @llvm.x86.avx512.psrav.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_srav_epi64(__U, __X, __Y);
}

@@ -4145,6 +4508,7 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
  return _mm256_maskz_set1_epi32(__M, 5);
}

+#ifdef __x86_64__
__m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
  // CHECK-LABEL: @test_mm_mask_set1_epi64
  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.128
@@ -4168,6 +4532,7 @@ __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.256
  return _mm256_maskz_set1_epi64(__M, __A);
}
+#endif

__m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) {
  // CHECK-LABEL: @test_mm_fixupimm_pd
@@ -4699,49 +5064,57 @@ __m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {

__m128d test_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) {
  // CHECK-LABEL: @test_mm_mask_permutevar_pd
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  // CHECK: @llvm.x86.avx.vpermilvar.pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm_mask_permutevar_pd(__W, __U, __A, __C);
}

__m128d test_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) {
  // CHECK-LABEL: @test_mm_maskz_permutevar_pd
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  // CHECK: @llvm.x86.avx.vpermilvar.pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm_maskz_permutevar_pd(__U, __A, __C);
}

__m256d test_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) {
  // CHECK-LABEL: @test_mm256_mask_permutevar_pd
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  // CHECK: @llvm.x86.avx.vpermilvar.pd.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_mask_permutevar_pd(__W, __U, __A, __C);
}

__m256d test_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) {
  // CHECK-LABEL: @test_mm256_maskz_permutevar_pd
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  // CHECK: @llvm.x86.avx.vpermilvar.pd.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_maskz_permutevar_pd(__U, __A, __C);
}

__m128 test_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) {
  // CHECK-LABEL: @test_mm_mask_permutevar_ps
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  // CHECK: @llvm.x86.avx.vpermilvar.ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm_mask_permutevar_ps(__W, __U, __A, __C);
}

__m128 test_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) {
  // CHECK-LABEL: @test_mm_maskz_permutevar_ps
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  // CHECK: @llvm.x86.avx.vpermilvar.ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm_maskz_permutevar_ps(__U, __A, __C);
}

__m256 test_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) {
  // CHECK-LABEL: @test_mm256_mask_permutevar_ps
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  // CHECK: @llvm.x86.avx.vpermilvar.ps.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_mask_permutevar_ps(__W, __U, __A, __C);
}

__m256 test_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) {
  // CHECK-LABEL: @test_mm256_maskz_permutevar_ps
-  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  // CHECK: @llvm.x86.avx.vpermilvar.ps.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_maskz_permutevar_ps(__U, __A, __C);
}

@@ -4954,121 +5327,137 @@ __m256i test_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)

__m128i test_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_sra_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  // CHECK: @llvm.x86.sse2.psra.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_sra_epi32(__W, __U, __A, __B);
}

__m128i test_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_sra_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  // CHECK: @llvm.x86.sse2.psra.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_sra_epi32(__U, __A, __B);
}

__m256i test_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_sra_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  // CHECK: @llvm.x86.avx2.psra.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_sra_epi32(__W, __U, __A, __B);
}

__m256i test_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_sra_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  // CHECK: @llvm.x86.avx2.psra.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_sra_epi32(__U, __A, __B);
}

__m128i test_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_srai_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  // CHECK: @llvm.x86.sse2.psrai.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_srai_epi32(__W, __U, __A, 5);
}

__m128i test_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_srai_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  // CHECK: @llvm.x86.sse2.psrai.d
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_srai_epi32(__U, __A, 5);
}

__m256i test_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_srai_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  // CHECK: @llvm.x86.avx2.psrai.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_srai_epi32(__W, __U, __A, 5);
}

__m256i test_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_srai_epi32
-  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  // CHECK: @llvm.x86.avx2.psrai.d
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_srai_epi32(__U, __A, 5);
}

__m128i test_mm_sra_epi64(__m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  // CHECK: @llvm.x86.avx512.psra.q.128
  return _mm_sra_epi64(__A, __B);
}

__m128i test_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  // CHECK: @llvm.x86.avx512.psra.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_sra_epi64(__W, __U, __A, __B);
}

__m128i test_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  // CHECK: @llvm.x86.avx512.psra.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_sra_epi64(__U, __A, __B);
}

__m256i test_mm256_sra_epi64(__m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  // CHECK: @llvm.x86.avx512.psra.q.256
  return _mm256_sra_epi64(__A, __B);
}

__m256i test_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  // CHECK: @llvm.x86.avx512.psra.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_sra_epi64(__W, __U, __A, __B);
}

__m256i test_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_sra_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  // CHECK: @llvm.x86.avx512.psra.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_sra_epi64(__U, __A, __B);
}

__m128i test_mm_srai_epi64(__m128i __A) {
  // CHECK-LABEL: @test_mm_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  // CHECK: @llvm.x86.avx512.psrai.q.128
  return _mm_srai_epi64(__A, 5);
}

__m128i test_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  // CHECK: @llvm.x86.avx512.psrai.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_srai_epi64(__W, __U, __A, 5);
}

__m128i test_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  // CHECK: @llvm.x86.avx512.psrai.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_srai_epi64(__U, __A, 5);
}
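The psra.q/psrai.q cases differ from the 32-bit shifts above: SSE2 and AVX2 never provided a packed 64-bit arithmetic right shift, so the unmasked forms still lower to dedicated @llvm.x86.avx512.psra* intrinsics rather than to an older target intrinsic or generic IR. A one-lane scalar sketch of the semantics (relying on >> of a signed value being an arithmetic shift, which C leaves implementation-defined but mainstream compilers guarantee):

#include <stdint.h>
#include <stdio.h>

/* One lane of a 64-bit arithmetic right shift: counts of 64 or more
   fill the lane with copies of the sign bit, i.e. behave like a shift
   by 63. */
static int64_t srai_epi64_lane(int64_t a, unsigned count) {
  return a >> (count > 63 ? 63 : count);
}

int main(void) {
  printf("%lld\n", (long long)srai_epi64_lane(-16, 2));  /* -4 */
  printf("%lld\n", (long long)srai_epi64_lane(-16, 99)); /* -1 */
  return 0;
}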
__m256i test_mm256_srai_epi64(__m256i __A) {
  // CHECK-LABEL: @test_mm256_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  // CHECK: @llvm.x86.avx512.psrai.q.256
  return _mm256_srai_epi64(__A, 5);
}

__m256i test_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  // CHECK: @llvm.x86.avx512.psrai.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_srai_epi64(__W, __U, __A, 5);
}

__m256i test_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_srai_epi64
-  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  // CHECK: @llvm.x86.avx512.psrai.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_srai_epi64(__U, __A, 5);
}

@@ -6199,73 +6588,81 @@ void test_mm256_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A

__m128 test_mm256_extractf32x4_ps(__m256 __A) {
  // CHECK-LABEL: @test_mm256_extractf32x4_ps
-  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  return _mm256_extractf32x4_ps(__A, 1);
}

__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
  // CHECK-LABEL: @test_mm256_mask_extractf32x4_ps
-  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1);
}

__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
  // CHECK-LABEL: @test_mm256_maskz_extractf32x4_ps
-  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm256_maskz_extractf32x4_ps(__U, __A, 1);
}

__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
  // CHECK-LABEL: @test_mm256_extracti32x4_epi32
-  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  return _mm256_extracti32x4_epi32(__A, 1);
}

__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_extracti32x4_epi32
-  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1);
}

__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_extracti32x4_epi32
-  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm256_maskz_extracti32x4_epi32(__U, __A, 1);
}

__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm256_insertf32x4
-  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_insertf32x4(__A, __B, 1);
}

__m256 test_mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm256_mask_insertf32x4
-  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_mask_insertf32x4(__W, __U, __A, __B, 1);
}

__m256 test_mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm256_maskz_insertf32x4
-  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_maskz_insertf32x4(__U, __A, __B, 1);
}

__m256i test_mm256_inserti32x4(__m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_inserti32x4
-  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_inserti32x4(__A, __B, 1);
}

__m256i test_mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_inserti32x4
-  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_inserti32x4(__W, __U, __A, __B, 1);
}

__m256i test_mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_inserti32x4
-  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_inserti32x4(__U, __A, __B, 1);
}

@@ -6545,73 +6942,81 @@ __m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X

__m128i test_mm_alignr_epi32(__m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  return _mm_alignr_epi32(__A, __B, 1);
}

__m128i test_mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_mask_alignr_epi32(__W, __U, __A, __B, 1);
}

__m128i test_mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
  return _mm_maskz_alignr_epi32(__U, __A, __B, 1);
}

__m256i test_mm256_alignr_epi32(__m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  return _mm256_alignr_epi32(__A, __B, 1);
}

__m256i test_mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_mask_alignr_epi32(__W, __U, __A, __B, 1);
}

__m256i test_mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm256_maskz_alignr_epi32(__U, __A, __B, 1);
}

__m128i test_mm_alignr_epi64(__m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
  return _mm_alignr_epi64(__A, __B, 1);
}

__m128i test_mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_mask_alignr_epi64(__W, __U, __A, __B, 1);
}

__m128i test_mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm_maskz_alignr_epi64(__U, __A, __B, 1);
}

__m256i test_mm256_alignr_epi64(__m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  return _mm256_alignr_epi64(__A, __B, 1);
}

__m256i test_mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_alignr_epi64(__W, __U, __A, __B, 1);
}

__m256i test_mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_alignr_epi64(__U, __A, __B, 1);
}
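The alignr lowering above is worth unpacking: with an immediate count, the concatenate-and-shift becomes a single shufflevector whose mask indexes into the combined lanes of both sources, with the low half supplied by the second source. A scalar sketch of the 4-lane case with count 1, matching the <i32 1, i32 2, i32 3, i32 4> mask (helper names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of a 4-lane alignr with count 1: view lo|hi as one
   8-lane vector (lo in lanes 0-3, hi in lanes 4-7) and take lanes
   1..4, which is exactly the shufflevector mask <1, 2, 3, 4>. */
static void alignr_epi32_1(int32_t dst[4], const int32_t hi[4],
                           const int32_t lo[4]) {
  const int32_t cat[8] = {lo[0], lo[1], lo[2], lo[3],
                          hi[0], hi[1], hi[2], hi[3]};
  for (int i = 0; i < 4; ++i)
    dst[i] = cat[i + 1];
}

int main(void) {
  const int32_t hi[4] = {4, 5, 6, 7}, lo[4] = {0, 1, 2, 3};
  int32_t dst[4];
  alignr_epi32_1(dst, hi, lo);
  for (int i = 0; i < 4; ++i)
    printf("%d ", (int)dst[i]); /* prints: 1 2 3 4 */
  putchar('\n');
  return 0;
}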
diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c
index 6bfa09f5d1c6..fe363c9ffde5 100644
--- a/test/CodeGen/avx512vlbw-builtins.c
+++ b/test/CodeGen/avx512vlbw-builtins.c
@@ -1,8 +1,6 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H
#include <immintrin.h>

@@ -737,119 +735,140 @@ __mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b

__m256i test_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
  //CHECK-LABEL: @test_mm256_mask_add_epi8
-  //CHECK: @llvm.x86.avx512.mask.padd.b.256
+  //CHECK: add <32 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_add_epi8(__W, __U , __A, __B);
}

__m256i test_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_maskz_add_epi8
-  //CHECK: @llvm.x86.avx512.mask.padd.b.256
+  //CHECK: add <32 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_add_epi8(__U , __A, __B);
}

__m256i test_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_mask_add_epi16
-  //CHECK: @llvm.x86.avx512.mask.padd.w.256
+  //CHECK: add <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_add_epi16(__W, __U , __A, __B);
}

__m256i test_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_maskz_add_epi16
-  //CHECK: @llvm.x86.avx512.mask.padd.w.256
+  //CHECK: add <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_add_epi16(__U , __A, __B);
}

__m256i test_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_mask_sub_epi8
-  //CHECK: @llvm.x86.avx512.mask.psub.b.256
+  //CHECK: sub <32 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_sub_epi8(__W, __U , __A, __B);
}

__m256i test_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_maskz_sub_epi8
-  //CHECK: @llvm.x86.avx512.mask.psub.b.256
+  //CHECK: sub <32 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_sub_epi8(__U , __A, __B);
}

__m256i test_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_mask_sub_epi16
-  //CHECK: @llvm.x86.avx512.mask.psub.w.256
+  //CHECK: sub <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_sub_epi16(__W, __U , __A, __B);
}

__m256i test_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_maskz_sub_epi16
-  //CHECK: @llvm.x86.avx512.mask.psub.w.256
+  //CHECK: sub <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_sub_epi16(__U , __A, __B);
}
+
__m128i test_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_mask_add_epi8
-  //CHECK: @llvm.x86.avx512.mask.padd.b.128
+  //CHECK: add <16 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_add_epi8(__W, __U , __A, __B);
}

__m128i test_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_maskz_add_epi8
-  //CHECK: @llvm.x86.avx512.mask.padd.b.128
+  //CHECK: add <16 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_add_epi8(__U , __A, __B);
}

__m128i test_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_mask_add_epi16
-  //CHECK: @llvm.x86.avx512.mask.padd.w.128
+  //CHECK: add <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_add_epi16(__W, __U , __A, __B);
}

__m128i test_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_maskz_add_epi16
-  //CHECK: @llvm.x86.avx512.mask.padd.w.128
+  //CHECK: add <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_add_epi16(__U , __A, __B);
}

__m128i test_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_mask_sub_epi8
-  //CHECK: @llvm.x86.avx512.mask.psub.b.128
+  //CHECK: sub <16 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_sub_epi8(__W, __U , __A, __B);
}

__m128i test_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_maskz_sub_epi8
-  //CHECK: @llvm.x86.avx512.mask.psub.b.128
+  //CHECK: sub <16 x i8> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_sub_epi8(__U , __A, __B);
}

__m128i test_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_mask_sub_epi16
-  //CHECK: @llvm.x86.avx512.mask.psub.w.128
+  //CHECK: sub <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_sub_epi16(__W, __U , __A, __B);
}

__m128i test_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_maskz_sub_epi16
-  //CHECK: @llvm.x86.avx512.mask.psub.w.128
+  //CHECK: sub <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_sub_epi16(__U , __A, __B);
}

__m256i test_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_mask_mullo_epi16
-  //CHECK: @llvm.x86.avx512.mask.pmull.w.256
+  //CHECK: mul <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_mullo_epi16(__W, __U , __A, __B);
}

__m256i test_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
  //CHECK-LABEL: @test_mm256_maskz_mullo_epi16
-  //CHECK: @llvm.x86.avx512.mask.pmull.w.256
+  //CHECK: mul <16 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_mullo_epi16(__U , __A, __B);
}

__m128i test_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_mask_mullo_epi16
-  //CHECK: @llvm.x86.avx512.mask.pmull.w.128
+  //CHECK: mul <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_mullo_epi16(__W, __U , __A, __B);
}

__m128i test_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
  //CHECK-LABEL: @test_mm_maskz_mullo_epi16
-  //CHECK: @llvm.x86.avx512.mask.pmull.w.128
+  //CHECK: mul <8 x i16> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_mullo_epi16(__U , __A, __B);
}

@@ -879,519 +898,651 @@ __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {

__m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_abs_epi8
-  // CHECK: @llvm.x86.avx512.mask.pabs.b.128
+  // CHECK: @llvm.x86.ssse3.pabs.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_abs_epi8(__W,__U,__A);
}

__m128i test_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_abs_epi8
-  // CHECK: @llvm.x86.avx512.mask.pabs.b.128
+  // CHECK: @llvm.x86.ssse3.pabs.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_abs_epi8(__U,__A);
}

__m256i test_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_abs_epi8
-  // CHECK: @llvm.x86.avx512.mask.pabs.b.256
+  // CHECK: @llvm.x86.avx2.pabs.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_abs_epi8(__W,__U,__A);
}

__m256i test_mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_abs_epi8
-  // CHECK: @llvm.x86.avx512.mask.pabs.b.256
+  // CHECK: @llvm.x86.avx2.pabs.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_abs_epi8(__U,__A);
}

__m128i test_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_mask_abs_epi16
-  // CHECK: @llvm.x86.avx512.mask.pabs.w.128
+  // CHECK: @llvm.x86.ssse3.pabs.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_abs_epi16(__W,__U,__A);
}

__m128i test_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) {
  // CHECK-LABEL: @test_mm_maskz_abs_epi16
-  // CHECK: @llvm.x86.avx512.mask.pabs.w.128
+  // CHECK: @llvm.x86.ssse3.pabs.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_abs_epi16(__U,__A);
}

__m256i test_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_abs_epi16
-  // CHECK: @llvm.x86.avx512.mask.pabs.w.256
+  // CHECK: @llvm.x86.avx2.pabs.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_abs_epi16(__W,__U,__A);
}

__m256i test_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_abs_epi16
-  // CHECK: @llvm.x86.avx512.mask.pabs.w.256
+  // CHECK: @llvm.x86.avx2.pabs.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_abs_epi16(__U,__A);
}

__m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_packs_epi32
-  // CHECK: @llvm.x86.avx512.mask.packssdw.128
+  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_packs_epi32(__M,__A,__B);
}

__m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_packs_epi32
-  // CHECK: @llvm.x86.avx512.mask.packssdw.128
+  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_packs_epi32(__W,__M,__A,__B);
}

__m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_packs_epi32
-  // CHECK: @llvm.x86.avx512.mask.packssdw.256
+  // CHECK: @llvm.x86.avx2.packssdw
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_packs_epi32(__M,__A,__B);
}

__m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_packs_epi32
-  // CHECK: @llvm.x86.avx512.mask.packssdw.256
+  // CHECK: @llvm.x86.avx2.packssdw
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_packs_epi32(__W,__M,__A,__B);
}

__m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_packs_epi16
-  // CHECK: @llvm.x86.avx512.mask.packsswb.128
+  // CHECK: @llvm.x86.sse2.packsswb
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_packs_epi16(__M,__A,__B);
}

__m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_packs_epi16
-  // CHECK: @llvm.x86.avx512.mask.packsswb.128
+  // CHECK: @llvm.x86.sse2.packsswb
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_packs_epi16(__W,__M,__A,__B);
}

__m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_packs_epi16
-  // CHECK: @llvm.x86.avx512.mask.packsswb.256
+  // CHECK: @llvm.x86.avx2.packsswb
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_packs_epi16(__M,__A,__B);
}

__m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_packs_epi16
-  // CHECK: @llvm.x86.avx512.mask.packsswb.256
+  // CHECK: @llvm.x86.avx2.packsswb
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_packs_epi16(__W,__M,__A,__B);
}

__m128i test_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_packus_epi32
-  // CHECK: @llvm.x86.avx512.mask.packusdw.128
+  // CHECK: @llvm.x86.sse41.packusdw
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_packus_epi32(__W,__M,__A,__B);
}

__m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_packus_epi32
-  // CHECK: @llvm.x86.avx512.mask.packusdw.128
+  // CHECK: @llvm.x86.sse41.packusdw
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_packus_epi32(__M,__A,__B);
}

__m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_packus_epi32
-  // CHECK: @llvm.x86.avx512.mask.packusdw.256
+  // CHECK: @llvm.x86.avx2.packusdw
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_packus_epi32(__M,__A,__B);
}

__m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_packus_epi32
-  // CHECK: @llvm.x86.avx512.mask.packusdw.256
+  // CHECK: @llvm.x86.avx2.packusdw
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_packus_epi32(__W,__M,__A,__B);
}

__m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_packus_epi16
-  // CHECK: @llvm.x86.avx512.mask.packuswb.128
+  // CHECK: @llvm.x86.sse2.packuswb
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_packus_epi16(__M,__A,__B);
}

__m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_packus_epi16
-  // CHECK: @llvm.x86.avx512.mask.packuswb.128
+  // CHECK: @llvm.x86.sse2.packuswb
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_packus_epi16(__W,__M,__A,__B);
}

__m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_packus_epi16
-  // CHECK: @llvm.x86.avx512.mask.packuswb.256
+  // CHECK: @llvm.x86.avx2.packuswb
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_packus_epi16(__M,__A,__B);
}

__m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_packus_epi16
-  // CHECK: @llvm.x86.avx512.mask.packuswb.256
+  // CHECK: @llvm.x86.avx2.packuswb
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_packus_epi16(__W,__M,__A,__B);
}

__m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_adds_epi8
-  // CHECK: @llvm.x86.avx512.mask.padds.b.128
+  // CHECK: @llvm.x86.sse2.padds.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_adds_epi8(__W,__U,__A,__B);
}

__m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_adds_epi8
-  // CHECK: @llvm.x86.avx512.mask.padds.b.128
+  // CHECK: @llvm.x86.sse2.padds.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_adds_epi8(__U,__A,__B);
}

__m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_adds_epi8
-  // CHECK: @llvm.x86.avx512.mask.padds.b.256
+  // CHECK: @llvm.x86.avx2.padds.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_adds_epi8(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_adds_epi8
-  // CHECK: @llvm.x86.avx512.mask.padds.b.256
+  // CHECK: @llvm.x86.avx2.padds.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_adds_epi8(__U,__A,__B);
}

__m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_adds_epi16
-  // CHECK: @llvm.x86.avx512.mask.padds.w.128
+  // CHECK: @llvm.x86.sse2.padds.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_adds_epi16(__W,__U,__A,__B);
}

__m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_adds_epi16
-  // CHECK: @llvm.x86.avx512.mask.padds.w.128
+  // CHECK: @llvm.x86.sse2.padds.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_adds_epi16(__U,__A,__B);
}

__m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_adds_epi16
-  // CHECK: @llvm.x86.avx512.mask.padds.w.256
+  // CHECK: @llvm.x86.avx2.padds.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_adds_epi16(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_adds_epi16
-  // CHECK: @llvm.x86.avx512.mask.padds.w.256
+  // CHECK: @llvm.x86.avx2.padds.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_adds_epi16(__U,__A,__B);
}
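Unlike the plain add/sub tests earlier in this hunk, the adds_* tests still expect target intrinsic calls (@llvm.x86.sse2.padds.b and friends): saturating arithmetic has no single generic IR instruction to lower to here, so only the trailing mask select is peeled off. A one-lane scalar sketch of the signed saturation those intrinsics perform (the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

/* One lane of @llvm.x86.sse2.padds.b: the 8-bit sum is clamped to the
   int8_t range instead of wrapping. */
static int8_t adds_epi8_lane(int8_t a, int8_t b) {
  int sum = (int)a + (int)b;
  if (sum > INT8_MAX) return INT8_MAX;
  if (sum < INT8_MIN) return INT8_MIN;
  return (int8_t)sum;
}

int main(void) {
  printf("%d\n", adds_epi8_lane(100, 100));   /* 127, not -56 */
  printf("%d\n", adds_epi8_lane(-100, -100)); /* -128 */
  return 0;
}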
__m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_adds_epu8
-  // CHECK: @llvm.x86.avx512.mask.paddus.b.128
+  // CHECK: @llvm.x86.sse2.paddus.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_adds_epu8(__W,__U,__A,__B);
}

__m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_adds_epu8
-  // CHECK: @llvm.x86.avx512.mask.paddus.b.128
+  // CHECK: @llvm.x86.sse2.paddus.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_adds_epu8(__U,__A,__B);
}

__m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_adds_epu8
-  // CHECK: @llvm.x86.avx512.mask.paddus.b.256
+  // CHECK: @llvm.x86.avx2.paddus.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_adds_epu8(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_adds_epu8
-  // CHECK: @llvm.x86.avx512.mask.paddus.b.256
+  // CHECK: @llvm.x86.avx2.paddus.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_adds_epu8(__U,__A,__B);
}

__m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_adds_epu16
-  // CHECK: @llvm.x86.avx512.mask.paddus.w.128
+  // CHECK: @llvm.x86.sse2.paddus.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_adds_epu16(__W,__U,__A,__B);
}

__m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_adds_epu16
-  // CHECK: @llvm.x86.avx512.mask.paddus.w.128
+  // CHECK: @llvm.x86.sse2.paddus.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_adds_epu16(__U,__A,__B);
}

__m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_adds_epu16
-  // CHECK: @llvm.x86.avx512.mask.paddus.w.256
+  // CHECK: @llvm.x86.avx2.paddus.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_adds_epu16(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_adds_epu16
-  // CHECK: @llvm.x86.avx512.mask.paddus.w.256
+  // CHECK: @llvm.x86.avx2.paddus.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_adds_epu16(__U,__A,__B);
}

__m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_avg_epu8
-  // CHECK: @llvm.x86.avx512.mask.pavg.b.128
+  // CHECK: @llvm.x86.sse2.pavg.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_mask_avg_epu8(__W,__U,__A,__B);
}

__m128i test_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_avg_epu8
-  // CHECK: @llvm.x86.avx512.mask.pavg.b.128
+  // CHECK: @llvm.x86.sse2.pavg.b
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
  return _mm_maskz_avg_epu8(__U,__A,__B);
}

__m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_avg_epu8
-  // CHECK: @llvm.x86.avx512.mask.pavg.b.256
+  // CHECK: @llvm.x86.avx2.pavg.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_mask_avg_epu8(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_avg_epu8
-  // CHECK: @llvm.x86.avx512.mask.pavg.b.256
+  // CHECK: @llvm.x86.avx2.pavg.b
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
  return _mm256_maskz_avg_epu8(__U,__A,__B);
}

__m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_avg_epu16
-  // CHECK: @llvm.x86.avx512.mask.pavg.w.128
+  // CHECK: @llvm.x86.sse2.pavg.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_mask_avg_epu16(__W,__U,__A,__B);
}

__m128i test_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_avg_epu16
-  // CHECK: @llvm.x86.avx512.mask.pavg.w.128
+  // CHECK: @llvm.x86.sse2.pavg.w
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
  return _mm_maskz_avg_epu16(__U,__A,__B);
}

__m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_avg_epu16
-  // CHECK: @llvm.x86.avx512.mask.pavg.w.256
+  // CHECK: @llvm.x86.avx2.pavg.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_mask_avg_epu16(__W,__U,__A,__B);
}

__m256i test_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_avg_epu16
-  // CHECK: @llvm.x86.avx512.mask.pavg.w.256
+  // CHECK: @llvm.x86.avx2.pavg.w
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
  return _mm256_maskz_avg_epu16(__U,__A,__B);
}

__m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_max_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmaxs.b.128
+  // CHECK: [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
  return _mm_maskz_max_epi8(__M,__A,__B);
}

__m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_mask_max_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmaxs.b.128
+  // CHECK: [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
  return _mm_mask_max_epi8(__W,__M,__A,__B);
}

__m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_maskz_max_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmaxs.b.256
+  // CHECK: [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
  return _mm256_maskz_max_epi8(__M,__A,__B);
}

__m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
  // CHECK-LABEL: @test_mm256_mask_max_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmaxs.b.256
+  // CHECK: [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
  return _mm256_mask_max_epi8(__W,__M,__A,__B);
}

__m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm_maskz_max_epi16
-  // CHECK: 
@llvm.x86.avx512.mask.pmaxs.w.128 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_maskz_max_epi16(__M,__A,__B); } __m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.128 + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_mask_max_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.256 + // CHECK: [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_maskz_max_epi16(__M,__A,__B); } __m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaxs.w.256 + // CHECK: [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_mask_max_epi16(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.128 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_maskz_max_epu8(__M,__A,__B); } __m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.128 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_mask_max_epu8(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.256 + // CHECK: [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_maskz_max_epu8(__M,__A,__B); } __m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epu8 - // CHECK: @llvm.x86.avx512.mask.pmaxu.b.256 + // CHECK: [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_mask_max_epu8(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_max_epu16 - // 
CHECK: @llvm.x86.avx512.mask.pmaxu.w.128 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_maskz_max_epu16(__M,__A,__B); } __m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.128 + // CHECK: [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_mask_max_epu16(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.256 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_maskz_max_epu16(__M,__A,__B); } __m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_max_epu16 - // CHECK: @llvm.x86.avx512.mask.pmaxu.w.256 + // CHECK: [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_mask_max_epu16(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.128 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_maskz_min_epi8(__M,__A,__B); } __m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.128 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_mask_min_epi8(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.256 + // CHECK: [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_maskz_min_epi8(__M,__A,__B); } __m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epi8 - // CHECK: @llvm.x86.avx512.mask.pmins.b.256 + // CHECK: [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_mask_min_epi8(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epi16 - 
// CHECK: @llvm.x86.avx512.mask.pmins.w.128 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_maskz_min_epi16(__M,__A,__B); } __m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.128 + // CHECK: [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_mask_min_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.256 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_maskz_min_epi16(__M,__A,__B); } __m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epi16 - // CHECK: @llvm.x86.avx512.mask.pmins.w.256 + // CHECK: [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_mask_min_epi16(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.128 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_maskz_min_epu8(__M,__A,__B); } __m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.128 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}} return _mm_mask_min_epu8(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.256 + // CHECK: [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_maskz_min_epu8(__M,__A,__B); } __m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epu8 - // CHECK: @llvm.x86.avx512.mask.pminu.b.256 + // CHECK: [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]] + // CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}} return _mm256_mask_min_epu8(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_min_epu16 
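  // A note on the pattern running through these min/max hunks: the old
  // target-specific @llvm.x86.avx512.mask.pmin*/pmax* intrinsics disappear
  // from the expected IR and are replaced by a generic icmp feeding a select,
  // with the k-mask applied by a second select. As a minimal one-lane sketch,
  // assuming an illustrative helper that is not part of this test or any API:
  //
  //   static inline unsigned short
  //   maskz_min_epu16_lane(int mask_bit, unsigned short a, unsigned short b) {
  //     unsigned short m = (a < b) ? a : b; // icmp ult + select ([[CMP]]/[[RES]])
  //     return mask_bit ? m : 0;            // final select on the <8 x i1> k-mask
  //   }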
- // CHECK: @llvm.x86.avx512.mask.pminu.w.128 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_maskz_min_epu16(__M,__A,__B); } __m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.128 + // CHECK: [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]] + // CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}} return _mm_mask_min_epu16(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.256 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_maskz_min_epu16(__M,__A,__B); } __m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_min_epu16 - // CHECK: @llvm.x86.avx512.mask.pminu.w.256 + // CHECK: [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]] + // CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]] + // CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}} return _mm256_mask_min_epu16(__W,__M,__A,__B); } __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.128 + // CHECK: @llvm.x86.ssse3.pshuf.b + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_shuffle_epi8(__W,__U,__A,__B); } __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.128 + // CHECK: @llvm.x86.ssse3.pshuf.b + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_shuffle_epi8(__U,__A,__B); } __m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.256 + // CHECK: @llvm.x86.avx2.pshuf.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_shuffle_epi8(__W,__U,__A,__B); } __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_epi8 - // CHECK: @llvm.x86.avx512.mask.pshuf.b.256 + // CHECK: @llvm.x86.avx2.pshuf.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_shuffle_epi8(__U,__A,__B); } __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_subs_epi8 - // CHECK: @llvm.x86.avx512.mask.psubs.b.128 + // CHECK: @llvm.x86.sse2.psubs.b + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epi8(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_subs_epi8 - // CHECK: @llvm.x86.avx512.mask.psubs.b.128 + // CHECK: @llvm.x86.sse2.psubs.b + // CHECK: 
select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epi8(__U,__A,__B); } __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_subs_epi8 - // CHECK: @llvm.x86.avx512.mask.psubs.b.256 + // CHECK: @llvm.x86.avx2.psubs.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epi8(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_subs_epi8 - // CHECK: @llvm.x86.avx512.mask.psubs.b.256 + // CHECK: @llvm.x86.avx2.psubs.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epi8(__U,__A,__B); } __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_subs_epi16 - // CHECK: @llvm.x86.avx512.mask.psubs.w.128 + // CHECK: @llvm.x86.sse2.psubs.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epi16(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_subs_epi16 - // CHECK: @llvm.x86.avx512.mask.psubs.w.128 + // CHECK: @llvm.x86.sse2.psubs.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epi16(__U,__A,__B); } __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_subs_epi16 - // CHECK: @llvm.x86.avx512.mask.psubs.w.256 + // CHECK: @llvm.x86.avx2.psubs.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epi16(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_subs_epi16 - // CHECK: @llvm.x86.avx512.mask.psubs.w.256 + // CHECK: @llvm.x86.avx2.psubs.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_subs_epi16(__U,__A,__B); } __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_subs_epu8 - // CHECK: @llvm.x86.avx512.mask.psubus.b.128 + // CHECK: @llvm.x86.sse2.psubus.b + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_subs_epu8 - // CHECK: @llvm.x86.avx512.mask.psubus.b.128 + // CHECK: @llvm.x86.sse2.psubus.b + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epu8(__U,__A,__B); } __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_subs_epu8 - // CHECK: @llvm.x86.avx512.mask.psubus.b.256 + // CHECK: @llvm.x86.avx2.psubus.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_subs_epu8 - // CHECK: @llvm.x86.avx512.mask.psubus.b.256 + // CHECK: @llvm.x86.avx2.psubus.b + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epu8(__U,__A,__B); } __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_subs_epu16 - // CHECK: 
@llvm.x86.avx512.mask.psubus.w.128 + // CHECK: @llvm.x86.sse2.psubus.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_subs_epu16 - // CHECK: @llvm.x86.avx512.mask.psubus.w.128 + // CHECK: @llvm.x86.sse2.psubus.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epu16(__U,__A,__B); } __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_subs_epu16 - // CHECK: @llvm.x86.avx512.mask.psubus.w.256 + // CHECK: @llvm.x86.avx2.psubus.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_subs_epu16 - // CHECK: @llvm.x86.avx512.mask.psubus.w.256 + // CHECK: @llvm.x86.avx2.psubus.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_subs_epu16(__U,__A,__B); } @@ -1439,49 +1590,57 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, } __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: @test_mm_mask_maddubs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.128 + // CHECK: @llvm.x86.ssse3.pmadd.ub.sw + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_maddubs_epi16(__W, __U, __X, __Y); } __m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: @test_mm_maskz_maddubs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.128 + // CHECK: @llvm.x86.ssse3.pmadd.ub.sw + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_maddubs_epi16(__U, __X, __Y); } __m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: @test_mm256_mask_maddubs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.256 + // CHECK: @llvm.x86.avx2.pmadd.ub.sw + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_maddubs_epi16(__W, __U, __X, __Y); } __m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: @test_mm256_maskz_maddubs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.256 + // CHECK: @llvm.x86.avx2.pmadd.ub.sw + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_maddubs_epi16(__U, __X, __Y); } __m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_madd_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddw.d.128 + // CHECK: @llvm.x86.sse2.pmadd.wd + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_madd_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_madd_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddw.d.128 + // CHECK: @llvm.x86.sse2.pmadd.wd + // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_madd_epi16(__U, __A, __B); } __m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_madd_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddw.d.256 + // CHECK: @llvm.x86.avx2.pmadd.wd + // 
CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_madd_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_madd_epi16 - // CHECK: @llvm.x86.avx512.mask.pmaddw.d.256 + // CHECK: @llvm.x86.avx2.pmadd.wd + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_madd_epi16(__U, __A, __B); } @@ -1595,73 +1754,85 @@ __m128i test_mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { __m128i test_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: @test_mm_mask_mulhrs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.128 + // CHECK: @llvm.x86.ssse3.pmul.hr.sw + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhrs_epi16(__W, __U, __X, __Y); } __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: @test_mm_maskz_mulhrs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.128 + // CHECK: @llvm.x86.ssse3.pmul.hr.sw + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhrs_epi16(__U, __X, __Y); } __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: @test_mm256_mask_mulhrs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.256 + // CHECK: @llvm.x86.avx2.pmul.hr.sw + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhrs_epi16(__W, __U, __X, __Y); } __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: @test_mm256_maskz_mulhrs_epi16 - // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.256 + // CHECK: @llvm.x86.avx2.pmul.hr.sw + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhrs_epi16(__U, __X, __Y); } __m128i test_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_mulhi_epu16 - // CHECK: @llvm.x86.avx512.mask.pmulhu.w.128 + // CHECK: @llvm.x86.sse2.pmulhu.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhi_epu16(__W, __U, __A, __B); } __m128i test_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_mulhi_epu16 - // CHECK: @llvm.x86.avx512.mask.pmulhu.w.128 + // CHECK: @llvm.x86.sse2.pmulhu.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhi_epu16(__U, __A, __B); } __m256i test_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_mulhi_epu16 - // CHECK: @llvm.x86.avx512.mask.pmulhu.w.256 + // CHECK: @llvm.x86.avx2.pmulhu.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhi_epu16(__W, __U, __A, __B); } __m256i test_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_mulhi_epu16 - // CHECK: @llvm.x86.avx512.mask.pmulhu.w.256 + // CHECK: @llvm.x86.avx2.pmulhu.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhi_epu16(__U, __A, __B); } __m128i test_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_mulhi_epi16 - // CHECK: @llvm.x86.avx512.mask.pmulh.w.128 + // CHECK: @llvm.x86.sse2.pmulh.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} 
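  // The maddubs/madd/mulhrs/mulhi hunks keep the arithmetic as an x86
  // intrinsic call (ssse3/sse2/avx2 flavor) and move only the masking into a
  // generic select. A minimal one-lane sketch for this merge-masking test,
  // assuming an illustrative helper that is not part of the file:
  //
  //   static inline short
  //   mask_mulhi_epi16_lane(int mask_bit, short w, short a, short b) {
  //     short hi = (short)(((int)a * (int)b) >> 16); // @llvm.x86.sse2.pmulh.w
  //     return mask_bit ? hi : w;                    // select on the k-mask
  //   }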
return _mm_mask_mulhi_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_mulhi_epi16 - // CHECK: @llvm.x86.avx512.mask.pmulh.w.128 + // CHECK: @llvm.x86.sse2.pmulh.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhi_epi16(__U, __A, __B); } __m256i test_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_mulhi_epi16 - // CHECK: @llvm.x86.avx512.mask.pmulh.w.256 + // CHECK: @llvm.x86.avx2.pmulh.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhi_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_mulhi_epi16 - // CHECK: @llvm.x86.avx512.mask.pmulh.w.256 + // CHECK: @llvm.x86.avx2.pmulh.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhi_epi16(__U, __A, __B); } @@ -1779,256 +1950,348 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask32 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_cvtepi8_epi16(__W, __U, __A); } __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128 + // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepi8_epi16(__U, __A); } __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask32 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256 + // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepi8_epi16(__W, __U, __A); } __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256 + // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepi8_epi16(__U, __A); } -__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128 - return _mm_mask_cvtepu8_epi32(__W, __U, __A); -} - -__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128 - return _mm_maskz_cvtepu8_epi32(__U, __A); -} - -__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256 - return _mm256_mask_cvtepu8_epi32(__W, __U, __A); -} - -__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256 - return _mm256_maskz_cvtepu8_epi32(__U, __A); -} - -__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64 - // CHECK: 
@llvm.x86.avx512.mask.pmovzxb.q.128 - return _mm_mask_cvtepu8_epi64(__W, __U, __A); -} - -__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128 - return _mm_maskz_cvtepu8_epi64(__U, __A); -} - -__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256 - return _mm256_mask_cvtepu8_epi64(__W, __U, __A); -} - -__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256 - return _mm256_maskz_cvtepu8_epi64(__U, __A); -} - -__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { - // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128 - return _mm_mask_cvtepu32_epi64(__W, __U, __X); -} - -__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128 - return _mm_maskz_cvtepu32_epi64(__U, __X); -} - -__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { - // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256 - return _mm256_mask_cvtepu32_epi64(__W, __U, __X); -} - -__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256 - return _mm256_maskz_cvtepu32_epi64(__U, __X); -} - -__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128 - return _mm_mask_cvtepu16_epi32(__W, __U, __A); -} - -__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128 - return _mm_maskz_cvtepu16_epi32(__U, __A); -} - -__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256 - return _mm256_mask_cvtepu16_epi32(__W, __U, __A); -} - -__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256 - return _mm256_maskz_cvtepu16_epi32(__U, __A); -} - -__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128 - return _mm_mask_cvtepu16_epi64(__W, __U, __A); -} - -__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128 - return _mm_maskz_cvtepu16_epi64(__U, __A); -} - -__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256 - return _mm256_mask_cvtepu16_epi64(__W, __U, __A); -} - -__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64 - // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256 - return _mm256_maskz_cvtepu16_epi64(__U, __A); -} - __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask32 __U, __m128i __A) { // CHECK-LABEL: 
@test_mm_mask_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128 + // CHECK: zext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_cvtepu8_epi16(__W, __U, __A); } __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128 + // CHECK: zext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepu8_epi16(__U, __A); } __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask32 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256 + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepu8_epi16(__W, __U, __A); } __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi16 - // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256 + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepu8_epi16(__U, __A); } __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.256( return _mm256_sllv_epi16(__A, __B); } __m256i test_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sllv_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sllv_epi16(__U, __A, __B); } __m128i test_mm_sllv_epi16(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.128( return _mm_sllv_epi16(__A, __B); } __m128i test_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sllv_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_sllv_epi16 - // CHECK: @llvm.x86.avx512.mask.psllv + // CHECK: @llvm.x86.avx512.psllv.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sllv_epi16(__U, __A, __B); } __m128i test_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.128 + // CHECK: @llvm.x86.sse2.psll.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sll_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.128 + // CHECK: @llvm.x86.sse2.psll.w + // CHECK: 
select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sll_epi16(__U, __A, __B); } __m256i test_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: @test_mm256_mask_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.256 + // CHECK: @llvm.x86.avx2.psll.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sll_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: @test_mm256_maskz_sll_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.w.256 + // CHECK: @llvm.x86.avx2.psll.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sll_epi16(__U, __A, __B); } __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.128 + // CHECK: @llvm.x86.sse2.pslli.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_slli_epi16(__W, __U, __A, 5); } __m128i test_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.128 + // CHECK: @llvm.x86.sse2.pslli.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_slli_epi16(__U, __A, 5); } __m256i test_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm256_mask_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.256 + // CHECK: @llvm.x86.avx2.pslli.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_slli_epi16(__W, __U, __A, 5); } __m256i test_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm256_maskz_slli_epi16 - // CHECK: @llvm.x86.avx512.mask.psll.wi.256 + // CHECK: @llvm.x86.avx2.pslli.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_slli_epi16(__U, __A, 5); } +__m256i test_mm256_srlv_epi16(__m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.256( + return _mm256_srlv_epi16(__A, __B); +} + +__m256i test_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_mask_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_srlv_epi16(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_maskz_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_srlv_epi16(__U, __A, __B); +} + +__m128i test_mm_srlv_epi16(__m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.128( + return _mm_srlv_epi16(__A, __B); +} + +__m128i test_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_mask_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_srlv_epi16(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_srlv_epi16 + // CHECK: @llvm.x86.avx512.psrlv.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> 
%{{.*}} + return _mm_maskz_srlv_epi16(__U, __A, __B); +} + +__m128i test_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_mask_srl_epi16 + // CHECK: @llvm.x86.sse2.psrl.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_srl_epi16(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_srl_epi16 + // CHECK: @llvm.x86.sse2.psrl.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_srl_epi16(__U, __A, __B); +} + +__m256i test_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_mask_srl_epi16 + // CHECK: @llvm.x86.avx2.psrl.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_srl_epi16(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_maskz_srl_epi16 + // CHECK: @llvm.x86.avx2.psrl.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_srl_epi16(__U, __A, __B); +} + +__m128i test_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_mask_srli_epi16 + // CHECK: @llvm.x86.sse2.psrli.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_srli_epi16(__W, __U, __A, 5); +} + +__m128i test_mm_maskz_srli_epi16(__mmask8 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_maskz_srli_epi16 + // CHECK: @llvm.x86.sse2.psrli.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_srli_epi16(__U, __A, 5); +} + +__m256i test_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_mask_srli_epi16 + // CHECK: @llvm.x86.avx2.psrli.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_srli_epi16(__W, __U, __A, 5); +} + +__m256i test_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_maskz_srli_epi16 + // CHECK: @llvm.x86.avx2.psrli.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_srli_epi16(__U, __A, 5); +} + +__m256i test_mm256_srav_epi16(__m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.256( + return _mm256_srav_epi16(__A, __B); +} + +__m256i test_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_mask_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_srav_epi16(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_maskz_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.256( + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_srav_epi16(__U, __A, __B); +} + +__m128i test_mm_srav_epi16(__m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.128( + return _mm_srav_epi16(__A, __B); +} + +__m128i test_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_mask_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> 
%{{.*}} + return _mm_mask_srav_epi16(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_srav_epi16 + // CHECK: @llvm.x86.avx512.psrav.w.128( + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_srav_epi16(__U, __A, __B); +} + +__m128i test_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_mask_sra_epi16 + // CHECK: @llvm.x86.sse2.psra.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_sra_epi16(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_sra_epi16 + // CHECK: @llvm.x86.sse2.psra.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_sra_epi16(__U, __A, __B); +} + +__m256i test_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_mask_sra_epi16 + // CHECK: @llvm.x86.avx2.psra.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_sra_epi16(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_maskz_sra_epi16 + // CHECK: @llvm.x86.avx2.psra.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_sra_epi16(__U, __A, __B); +} + +__m128i test_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_mask_srai_epi16 + // CHECK: @llvm.x86.sse2.psrai.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_srai_epi16(__W, __U, __A, 5); +} + +__m128i test_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_maskz_srai_epi16 + // CHECK: @llvm.x86.sse2.psrai.w + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_srai_epi16(__U, __A, 5); +} + +__m256i test_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_mask_srai_epi16 + // CHECK: @llvm.x86.avx2.psrai.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_srai_epi16(__W, __U, __A, 5); +} + +__m256i test_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_maskz_srai_epi16 + // CHECK: @llvm.x86.avx2.psrai.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_srai_epi16(__U, __A, 5); +} + __m128i test_mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_mov_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} diff --git a/test/CodeGen/avx512vlcd-builtins.c b/test/CodeGen/avx512vlcd-builtins.c index f69da039a4fb..643f24f1d22c 100644 --- a/test/CodeGen/avx512vlcd-builtins.c +++ b/test/CodeGen/avx512vlcd-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vl -target-feature +avx512cd -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vl -target-feature +avx512cd -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
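// (As with the other builtin tests in this change, the RUN lines gain
// -ffreestanding, which keeps <immintrin.h> from dragging in the hosted-only
// mm_malloc.h, so pre-defining its include guard below becomes unnecessary;
// -Wall joins -Werror so new header warnings fail the test. That reading is
// inferred from the diff rather than stated in it.)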
-#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/avx512vldq-builtins.c b/test/CodeGen/avx512vldq-builtins.c index fa1e0623e193..6834b6f46b28 100644 --- a/test/CodeGen/avx512vldq-builtins.c +++ b/test/CodeGen/avx512vldq-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> @@ -13,13 +11,15 @@ __m256i test_mm256_mullo_epi64 (__m256i __A, __m256i __B) { __m256i test_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.256 + // CHECK: mul <4 x i64> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return (__m256i) _mm256_mask_mullo_epi64 ( __W, __U, __A, __B); } __m256i test_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.256 + // CHECK: mul <4 x i64> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return (__m256i) _mm256_maskz_mullo_epi64 (__U, __A, __B); } @@ -31,205 +31,247 @@ __m128i test_mm_mullo_epi64 (__m128i __A, __m128i __B) { __m128i test_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.128 + // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return (__m128i) _mm_mask_mullo_epi64 ( __W, __U, __A, __B); } __m128i test_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_mullo_epi64 - // CHECK: @llvm.x86.avx512.mask.pmull.q.128 + // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return (__m128i) _mm_maskz_mullo_epi64 (__U, __A, __B); } __m256d test_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.256 + // CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: and <4 x i64> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return (__m256d) _mm256_mask_andnot_pd ( __W, __U, __A, __B); } __m256d test_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.256 + // CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1> + // CHECK: and <4 x i64> %{{.*}}, %{{.*}} + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return (__m256d) _mm256_maskz_andnot_pd (__U, __A, __B); } __m128d test_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_andnot_pd - // CHECK: @llvm.x86.avx512.mask.andn.pd.128 + // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1> + // CHECK: and <2 x i64> %{{.*}}, %{{.*}} + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return (__m128d) _mm_mask_andnot_pd ( __W, __U, __A, __B); } __m128d test_mm_maskz_andnot_pd (__mmask8 __U, 
__m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_maskz_andnot_pd
-  // CHECK: @llvm.x86.avx512.mask.andn.pd.128
+  // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_maskz_andnot_pd (__U, __A, __B);
}

__m256 test_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_mask_andnot_ps
-  // CHECK: @llvm.x86.avx512.mask.andn.ps.256
+  // CHECK: xor <8 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_mask_andnot_ps ( __W, __U, __A, __B);
}

__m256 test_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_maskz_andnot_ps
-  // CHECK: @llvm.x86.avx512.mask.andn.ps.256
+  // CHECK: xor <8 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_maskz_andnot_ps (__U, __A, __B);
}

__m128 test_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_mask_andnot_ps
-  // CHECK: @llvm.x86.avx512.mask.andn.ps.128
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_mask_andnot_ps ( __W, __U, __A, __B);
}

__m128 test_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_maskz_andnot_ps
-  // CHECK: @llvm.x86.avx512.mask.andn.ps.128
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_maskz_andnot_ps (__U, __A, __B);
}

__m256d test_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_mask_and_pd
-  // CHECK: @llvm.x86.avx512.mask.and.pd.256
+  // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_mask_and_pd ( __W, __U, __A, __B);
}

__m256d test_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_maskz_and_pd
-  // CHECK: @llvm.x86.avx512.mask.and.pd.256
+  // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_maskz_and_pd (__U, __A, __B);
}

__m128d test_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_mask_and_pd
-  // CHECK: @llvm.x86.avx512.mask.and.pd.128
+  // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_mask_and_pd ( __W, __U, __A, __B);
}

__m128d test_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_maskz_and_pd
-  // CHECK: @llvm.x86.avx512.mask.and.pd.128
+  // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_maskz_and_pd (__U, __A, __B);
}

__m256 test_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_mask_and_ps
-  // CHECK: @llvm.x86.avx512.mask.and.ps.256
+  // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_mask_and_ps ( __W, __U, __A, __B);
}

__m256 test_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_maskz_and_ps
-  // CHECK: @llvm.x86.avx512.mask.and.ps.256
+  // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_maskz_and_ps (__U, __A, __B);
}

__m128 test_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_mask_and_ps
-  // CHECK: @llvm.x86.avx512.mask.and.ps.128
+  // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_mask_and_ps ( __W, __U, __A, __B);
}

__m128 test_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_maskz_and_ps
-  // CHECK: @llvm.x86.avx512.mask.and.ps.128
+  // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_maskz_and_ps (__U, __A, __B);
}

__m256d test_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_mask_xor_pd
-  // CHECK: @llvm.x86.avx512.mask.xor.pd.256
+  // CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_mask_xor_pd ( __W, __U, __A, __B);
}

__m256d test_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_maskz_xor_pd
-  // CHECK: @llvm.x86.avx512.mask.xor.pd.256
+  // CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_maskz_xor_pd (__U, __A, __B);
}

__m128d test_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_mask_xor_pd
-  // CHECK: @llvm.x86.avx512.mask.xor.pd.128
+  // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_mask_xor_pd ( __W, __U, __A, __B);
}

__m128d test_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_maskz_xor_pd
-  // CHECK: @llvm.x86.avx512.mask.xor.pd.128
+  // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_maskz_xor_pd (__U, __A, __B);
}

__m256 test_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_mask_xor_ps
-  // CHECK: @llvm.x86.avx512.mask.xor.ps.256
+  // CHECK: xor <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_mask_xor_ps ( __W, __U, __A, __B);
}

__m256 test_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_maskz_xor_ps
-  // CHECK: @llvm.x86.avx512.mask.xor.ps.256
+  // CHECK: xor <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_maskz_xor_ps (__U, __A, __B);
}

__m128 test_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_mask_xor_ps
-  // CHECK: @llvm.x86.avx512.mask.xor.ps.128
-  return (__m128) _mm_mask_xor_ps ( __W, __U, __A, __B);
+  // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return (__m128) _mm_mask_xor_ps ( __W, __U, __A, __B);
}

__m128 test_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_maskz_xor_ps
-  // CHECK: @llvm.x86.avx512.mask.xor.ps.128
+  // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_maskz_xor_ps (__U, __A, __B);
}

__m256d test_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_mask_or_pd
-  // CHECK: @llvm.x86.avx512.mask.or.pd.256
+  // CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_mask_or_pd ( __W, __U, __A, __B);
}

__m256d test_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
  // CHECK-LABEL: @test_mm256_maskz_or_pd
-  // CHECK: @llvm.x86.avx512.mask.or.pd.256
+  // CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return (__m256d) _mm256_maskz_or_pd (__U, __A, __B);
}

__m128d test_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_mask_or_pd
-  // CHECK: @llvm.x86.avx512.mask.or.pd.128
+  // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_mask_or_pd ( __W, __U, __A, __B);
}

__m128d test_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm_maskz_or_pd
-  // CHECK: @llvm.x86.avx512.mask.or.pd.128
+  // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return (__m128d) _mm_maskz_or_pd (__U, __A, __B);
}

__m256 test_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_mask_or_ps
-  // CHECK: @llvm.x86.avx512.mask.or.ps.256
+  // CHECK: or <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_mask_or_ps ( __W, __U, __A, __B);
}

__m256 test_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
  // CHECK-LABEL: @test_mm256_maskz_or_ps
-  // CHECK: @llvm.x86.avx512.mask.or.ps.256
+  // CHECK: or <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return (__m256) _mm256_maskz_or_ps (__U, __A, __B);
}

__m128 test_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_mask_or_ps
-  // CHECK: @llvm.x86.avx512.mask.or.ps.128
+  // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_mask_or_ps ( __W, __U, __A, __B);
}

__m128 test_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
  // CHECK-LABEL: @test_mm_maskz_or_ps
-  // CHECK: @llvm.x86.avx512.mask.or.ps.128
+  // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return (__m128) _mm_maskz_or_ps(__U, __A, __B);
}

@@ -950,73 +992,81 @@ __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {

__m128d test_mm256_extractf64x2_pd(__m256d __A) {
  // CHECK-LABEL: @test_mm256_extractf64x2_pd
-  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  return _mm256_extractf64x2_pd(__A, 1);
}

__m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
  // CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
-  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1);
}

__m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
  // CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
-  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm256_maskz_extractf64x2_pd(__U, __A, 1);
}

__m128i test_mm256_extracti64x2_epi64(__m256i __A) {
  // CHECK-LABEL: @test_mm256_extracti64x2_epi64
-  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  return _mm256_extracti64x2_epi64(__A, 1);
}

__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
-  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1);
}

__m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
  // CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
-  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm256_maskz_extracti64x2_epi64(__U, __A, 1);
}

__m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm256_insertf64x2
-  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  return _mm256_insertf64x2(__A, __B, 1);
}

__m256d test_mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm256_mask_insertf64x2
-  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_mask_insertf64x2(__W, __U, __A, __B, 1);
}

__m256d test_mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B) {
  // CHECK-LABEL: @test_mm256_maskz_insertf64x2
-  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_maskz_insertf64x2(__U, __A, __B, 1);
}

__m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_inserti64x2
-  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  return _mm256_inserti64x2(__A, __B, 1);
}

__m256i test_mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_mask_inserti64x2
-  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_mask_inserti64x2(__W, __U, __A, __B, 1);
}

__m256i test_mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B) {
  // CHECK-LABEL: @test_mm256_maskz_inserti64x2
-  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
  return _mm256_maskz_inserti64x2(__U, __A, __B, 1);
}

diff --git a/test/CodeGen/bitscan-builtins.c b/test/CodeGen/bitscan-builtins.c
index ae817e815749..71e49845f896 100644
--- a/test/CodeGen/bitscan-builtins.c
+++ b/test/CodeGen/bitscan-builtins.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H
#include <immintrin.h>

int test_bit_scan_forward(int a) {

diff --git a/test/CodeGen/block-with-perdefinedexpr.cpp b/test/CodeGen/block-with-perdefinedexpr.cpp
new file mode 100644
index 000000000000..890c2d635856
--- /dev/null
+++ b/test/CodeGen/block-with-perdefinedexpr.cpp
@@ -0,0 +1,87 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -fblocks -triple x86_64-apple-darwin10 -std=c++11 | FileCheck %s
+
+void bar() {
+  // CHECK-DAG: @__FUNCTION__.___Z3barv_block_invoke = private unnamed_addr constant [17 x i8] c"bar_block_invoke\00", align 1
+  const char * (^block1)() = ^() {
+    return __FUNCTION__;
+  };
+  // CHECK-DAG: @__FUNCTION__.___Z3barv_block_invoke_2 = private unnamed_addr constant [19 x i8] c"bar_block_invoke_2\00", align 1
+  const char * (^block2)() = ^() {
+    return __FUNCTION__;
+  };
+}
+
+void baz() {
+  // CHECK-DAG: @__PRETTY_FUNCTION__.___Z3bazv_block_invoke = private unnamed_addr constant [24 x i8] c"void baz()_block_invoke\00", align 1
+  const char * (^block1)() = ^() {
+    return __PRETTY_FUNCTION__;
+  };
+  // CHECK-DAG: @__PRETTY_FUNCTION__.___Z3bazv_block_invoke_2 = private unnamed_addr constant [26 x i8] c"void baz()_block_invoke_2\00", align 1
+  const char * (^block2)() = ^() {
+    return __PRETTY_FUNCTION__;
+  };
+}
+
+namespace foonamespace {
+class Foo {
+public:
+  Foo() {
+    // CHECK-DAG: @__PRETTY_FUNCTION__.___ZN12foonamespace3FooC2Ev_block_invoke = private unnamed_addr constant [38 x i8] c"foonamespace::Foo::Foo()_block_invoke\00", align 1
+    const char * (^block1)() = ^() {
+      return __PRETTY_FUNCTION__;
+    };
+    // CHECK-DAG: @__PRETTY_FUNCTION__.___ZN12foonamespace3FooC2Ev_block_invoke_2 = private unnamed_addr constant [40 x i8] c"foonamespace::Foo::Foo()_block_invoke_2\00", align 1
+    const char * (^block2)() = ^() {
+      return __PRETTY_FUNCTION__;
+    };
+    // CHECK-DAG: @__func__.___ZN12foonamespace3FooC2Ev_block_invoke_3 = private unnamed_addr constant [19 x i8] c"Foo_block_invoke_3\00", align 1
+    const char * (^block3)() = ^() {
+      return __func__;
+    };
+    bar();
+    inside_lambda();
+  }
+  ~Foo() {
+    // CHECK-DAG: @__func__.___ZN12foonamespace3FooD2Ev_block_invoke = private unnamed_addr constant [18 x i8] c"~Foo_block_invoke\00", align 1
+    const char * (^block1)() = ^() {
+      return __func__;
+    };
+    // CHECK-DAG: @__PRETTY_FUNCTION__.___ZN12foonamespace3FooD2Ev_block_invoke_2 = private unnamed_addr constant [41 x i8] c"foonamespace::Foo::~Foo()_block_invoke_2\00", align 1
+    const char * (^block2)() = ^() {
+      return __PRETTY_FUNCTION__;
+    };
+  }
+  void bar() {
+    // CHECK-DAG: @__PRETTY_FUNCTION__.___ZN12foonamespace3Foo3barEv_block_invoke = private unnamed_addr constant [43 x i8] c"void foonamespace::Foo::bar()_block_invoke\00", align 1
+    const char * (^block1)() = ^() {
+      return __PRETTY_FUNCTION__;
+    };
+    // CHECK-DAG: @__PRETTY_FUNCTION__.___ZN12foonamespace3Foo3barEv_block_invoke_2 = private unnamed_addr constant [45 x i8] c"void foonamespace::Foo::bar()_block_invoke_2\00", align 1
+    const char * (^block2)() = ^() {
+      return __PRETTY_FUNCTION__;
+    };
+    // CHECK-DAG: @__func__.___ZN12foonamespace3Foo3barEv_block_invoke_3 = private unnamed_addr constant [19 x i8] c"bar_block_invoke_3\00", align 1
+    const char * (^block3)() = ^() {
+      return __func__;
+    };
+  }
+  void inside_lambda() {
+    auto lambda = []() {
+      // CHECK-DAG: @__PRETTY_FUNCTION__.___ZZN12foonamespace3Foo13inside_lambdaEvENKUlvE_clEv_block_invoke = private unnamed_addr constant [92 x i8] c"auto foonamespace::Foo::inside_lambda()::(anonymous class)::operator()() const_block_invoke\00", align 1
+      const char * (^block1)() = ^() {
+        return __PRETTY_FUNCTION__;
+      };
+      // CHECK-DAG: @__PRETTY_FUNCTION__.___ZZN12foonamespace3Foo13inside_lambdaEvENKUlvE_clEv_block_invoke_2 = private unnamed_addr constant [94 x i8] c"auto foonamespace::Foo::inside_lambda()::(anonymous class)::operator()() const_block_invoke_2\00", align 1
+      const char * (^block2)() = ^() {
+        return __PRETTY_FUNCTION__;
+      };
+      // CHECK-DAG: @__func__.___ZZN12foonamespace3Foo13inside_lambdaEvENKUlvE_clEv_block_invoke_3 = private unnamed_addr constant [26 x i8] c"operator()_block_invoke_3\00", align 1
+      const char * (^block3)() = ^() {
+        return __func__;
+      };
+    };
+    lambda();
+  }
+};
+Foo f;
+}

diff --git a/test/CodeGen/blocks-opencl.cl b/test/CodeGen/blocks-opencl.cl
index 61c479b7b98e..12513331e9d4 100644
--- a/test/CodeGen/blocks-opencl.cl
+++ b/test/CodeGen/blocks-opencl.cl
@@ -5,7 +5,7 @@ void dummy(float (^const op)(float)) {
}

-// CHECK: i8 addrspace(3)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(3)* @.str, i32 0, i32 0)
+// CHECK: i8 addrspace(2)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(2)* @.str, i32 0, i32 0)

kernel void test_block()
{

diff --git a/test/CodeGen/bmi-builtins.c b/test/CodeGen/bmi-builtins.c
index b9e22f9b20f5..f78e3fdd4a17 100644
--- a/test/CodeGen/bmi-builtins.c
+++ b/test/CodeGen/bmi-builtins.c
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H
#include <x86intrin.h>

diff --git a/test/CodeGen/bmi2-builtins.c b/test/CodeGen/bmi2-builtins.c
index b4e3fec79a47..3a5d5e756ddb 100644
--- a/test/CodeGen/bmi2-builtins.c
+++ b/test/CodeGen/bmi2-builtins.c
@@ -1,8 +1,6 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32

-// Don't include mm_malloc.h, it's system specific.
-#define __MM_MALLOC_H
#include <x86intrin.h>

diff --git a/test/CodeGen/bool_test.c b/test/CodeGen/bool_test.c
index b48da3748e60..ead8854e284b 100644
--- a/test/CodeGen/bool_test.c
+++ b/test/CodeGen/bool_test.c
@@ -1,5 +1,5 @@
// REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -triple powerpc-apple-macosx10.4.0 -emit-llvm -o - %s -O2 -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-apple-macosx10.4.0 -emit-llvm -o - %s -O2 -disable-llvm-passes | FileCheck %s

int boolsize = sizeof(_Bool);
// CHECK: boolsize = global i32 4, align 4

diff --git a/test/CodeGen/builtin-clflushopt.c b/test/CodeGen/builtin-clflushopt.c
index 7248d378a4e1..e98c2aaba573 100644
--- a/test/CodeGen/builtin-clflushopt.c
+++ b/test/CodeGen/builtin-clflushopt.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +clflushopt -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +clflushopt -emit-llvm -o - -Wall -Werror | FileCheck %s
#define __MM_MALLOC_H
#include <immintrin.h>

diff --git a/test/CodeGen/builtin-expect.c b/test/CodeGen/builtin-expect.c
index 560625ed2439..2d4970021732 100644
--- a/test/CodeGen/builtin-expect.c
+++ b/test/CodeGen/builtin-expect.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O1 -disable-llvm-optzns | FileCheck %s --check-prefix=ALL --check-prefix=O1
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O1 -disable-llvm-passes | FileCheck %s --check-prefix=ALL --check-prefix=O1
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=ALL --check-prefix=O0

// In all tests, make sure that no expect is generated if optimizations are off.

diff --git a/test/CodeGen/builtin-unpredictable.c b/test/CodeGen/builtin-unpredictable.c
index 653f231cad27..30709b0cac68 100644
--- a/test/CodeGen/builtin-unpredictable.c
+++ b/test/CodeGen/builtin-unpredictable.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-llvm-optzns -o - %s -O1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-llvm-passes -o - %s -O1 | FileCheck %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=CHECK_O0

// When optimizing, the builtin should be converted to metadata.
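The AVX-512 test updates above all encode a single lowering change: masked logical intrinsics are no longer expected to emit @llvm.x86.avx512.mask.* calls, but a plain bitwise op on the integer view of the vector followed by a per-lane select on the mask bits. A minimal sketch of the pattern being checked, in the same style as these tests (the file name and RUN line here are illustrative, not part of this commit):

// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s

#include <immintrin.h>

// Hypothetical standalone test; _mm_mask_and_pd is exercised by the real
// avx512 tests in this commit with exactly these expectations.
__m128d sketch_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  // The AND is performed on the <2 x i64> bitcast of the double vector ...
  // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
  // ... and the low two bits of the i8 mask then select, per lane, between
  // the result and the passthrough vector __W.
  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm_mask_and_pd(__W, __U, __A, __B);
}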
diff --git a/test/CodeGen/builtins-ms.c b/test/CodeGen/builtins-ms.c
index 0676e9df7a7d..a33b57de8ddf 100644
--- a/test/CodeGen/builtins-ms.c
+++ b/test/CodeGen/builtins-ms.c
@@ -1,9 +1,16 @@
// RUN: %clang_cc1 %s -emit-llvm -o - -fms-extensions -triple i686-pc-win32 | FileCheck %s

-// CHECK-LABEL: define void @test_alloca
+// CHECK-LABEL: define void @test_alloca(
void capture(void *);
void test_alloca(int n) {
  capture(_alloca(n));
-  // CHECK: %[[arg:.*]] = alloca i8, i32 %
+  // CHECK: %[[arg:.*]] = alloca i8, i32 %{{.*}}, align 16
  // CHECK: call void @capture(i8* %[[arg]])
}
+
+// CHECK-LABEL: define void @test_alloca_with_align(
+void test_alloca_with_align(int n) {
+  capture(__builtin_alloca_with_align(n, 64));
+  // CHECK: %[[arg:.*]] = alloca i8, i32 %{{.*}}, align 8
+  // CHECK: call void @capture(i8* %[[arg]])
+}

diff --git a/test/CodeGen/builtins-nvptx.c b/test/CodeGen/builtins-nvptx.c
index cd21361140bc..b0d646a51fec 100644
--- a/test/CodeGen/builtins-nvptx.c
+++ b/test/CodeGen/builtins-nvptx.c
@@ -1,8 +1,12 @@
// REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
-// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
-// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
+// RUN:   -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN:   -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
+// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

@@ -191,8 +195,9 @@ __shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
-__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
-                          long long *llp, long long ll) {
+__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
+                          int i, unsigned int *uip, unsigned ui, long *lp,
+                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add

@@ -280,6 +285,255 @@ __device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip,
  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
  __nvvm_atom_dec_gen_ui(uip, ui);
+
+  //////////////////////////////////////////////////////////////////
+  // Atomics with scope (only supported on sm_60+).
+
+#if ERROR_CHECK || __CUDA_ARCH__ >= 600
+
+  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_add_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_add_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_add_gen_ll(&sll, ll);
+  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_add_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_add_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_add_gen_ll(&sll, ll);
+
+  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32
+  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature satom}}
+  __nvvm_atom_cta_add_gen_f(fp, f);
+  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64
+  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature satom}}
+  __nvvm_atom_cta_add_gen_d(dfp, df);
+  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32
+  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature satom}}
+  __nvvm_atom_sys_add_gen_f(fp, f);
+  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64
+  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature satom}}
+  __nvvm_atom_sys_add_gen_d(dfp, df);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_xchg_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_xchg_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_xchg_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_xchg_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_i(ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_l(&dl, l);
+  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_ll(&sll, ll);
+  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature satom}}
+  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_i(ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_l(&dl, l);
+  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_ll(&sll, ll);
+  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature satom}}
+  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_i(ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_l(&dl, l);
+  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_ll(&sll, ll);
+  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature satom}}
+  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_i(ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_l(&dl, l);
+  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_ll(&sll, ll);
+  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature satom}}
+  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature satom}}
+  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature satom}}
+  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature satom}}
+  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
+  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature satom}}
+  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_and_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_and_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_and_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_and_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_and_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_and_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_or_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_or_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_or_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_or_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_or_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_or_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_xor_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_xor_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_xor_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_xor_gen_i(ip, i);
+  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_xor_gen_l(&dl, l);
+  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_xor_gen_ll(&sll, ll);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature satom}}
+  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
+  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature satom}}
+  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
+  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature satom}}
+  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
+  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature satom}}
+  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
+  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
+  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature satom}}
+  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
+  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
+  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature satom}}
+  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
+#endif
+
  // CHECK: ret
}

diff --git a/test/CodeGen/builtins-ppc-altivec.c b/test/CodeGen/builtins-ppc-altivec.c
index 1edf99f8681c..3b75cb49c3fe 100644
--- a/test/CodeGen/builtins-ppc-altivec.c
+++ b/test/CodeGen/builtins-ppc-altivec.c
@@ -45,6 +45,7 @@ unsigned short param_us;
int param_i;
unsigned int param_ui;
float param_f;
+signed long long param_sll;

int res_sc;
int res_uc;

@@ -88,6 +89,43 @@ void test1() {
// CHECK-NOALTIVEC: error: use of undeclared identifier 'vf'
// CHECK-NOALTIVEC: vf = vec_abs(vf)

+  vsc = vec_nabs(vsc);
+// CHECK: sub <16 x i8> zeroinitializer
+// CHECK: @llvm.ppc.altivec.vminsb
+// CHECK-LE: sub <16 x i8> zeroinitializer
+// CHECK-LE: @llvm.ppc.altivec.vminsb
+
+  vs = vec_nabs(vs);
+// CHECK: sub <8 x i16> zeroinitializer
+// CHECK: @llvm.ppc.altivec.vminsh
+// CHECK-LE: sub <8 x i16> zeroinitializer
+// CHECK-LE: @llvm.ppc.altivec.vminsh
+
+  vi = vec_nabs(vi);
+// CHECK: sub <4 x i32> zeroinitializer
+// CHECK: @llvm.ppc.altivec.vminsw
+// CHECK-LE: sub <4 x i32> zeroinitializer
+// CHECK-LE: @llvm.ppc.altivec.vminsw
+
+  res_vi = vec_neg(vi);
+// CHECK: sub <4 x i32> zeroinitializer, {{%[0-9]+}}
+// CHECK-LE: sub <4 x i32> zeroinitializer, {{%[0-9]+}}
+// CHECK-NOALTIVEC: error: use of undeclared identifier 'vi'
+// CHECK-NOALTIVEC: vi = vec_neg(vi);
+
+  res_vs = vec_neg(vs);
+// CHECK: sub <8 x i16> zeroinitializer, {{%[0-9]+}}
+// CHECK-LE: sub <8 x i16> zeroinitializer, {{%[0-9]+}}
+// CHECK-NOALTIVEC: error: use of undeclared identifier 'vs'
+// CHECK-NOALTIVEC: res_vs = vec_neg(vs);
+
+  res_vsc = vec_neg(vsc);
+// CHECK: sub <16 x i8> zeroinitializer, {{%[0-9]+}}
+// CHECK-LE: sub <16 x i8> zeroinitializer, {{%[0-9]+}}
+// CHECK-NOALTIVEC: error: use of undeclared identifier 'vsc'
+// CHECK-NOALTIVEC: res_vsc = vec_neg(vsc);
+
  /* vec_abs */
  vsc = vec_abss(vsc);
// CHECK: @llvm.ppc.altivec.vsubsbs

@@ -184,6 +222,22 @@ void test1() {
// CHECK: fadd <4 x float>
// CHECK-LE: fadd <4 x float>

+  res_vi = vec_adde(vi, vi, vi);
+// CHECK: and <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK-LE: and <4 x i32>
+// CHECK-LE: add <4 x i32>
+// CHECK-LE: add <4 x i32>
+
+  res_vui = vec_adde(vui, vui, vui);
+// CHECK: and <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK-LE: and <4 x i32>
+// CHECK-LE: add <4 x i32>
+// CHECK-LE: add <4 x i32>
+
  res_vsc = vec_vaddubm(vsc, vsc);
// CHECK: add <16 x i8>
// CHECK-LE: add <16 x i8>

@@ -938,6 +992,14 @@ void test2() {
// CHECK: @llvm.ppc.altivec.vcmpequb
// CHECK-LE: @llvm.ppc.altivec.vcmpequb

+  res_vbc = vec_cmpeq(vbc, vbc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+
+  res_vbc = vec_cmpeq(vbc, vbc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+
  res_vbs = vec_cmpeq(vs, vs);
// CHECK: @llvm.ppc.altivec.vcmpequh
// CHECK-LE: @llvm.ppc.altivec.vcmpequh

@@ -946,6 +1008,14 @@ void test2() {
// CHECK: @llvm.ppc.altivec.vcmpequh
// CHECK-LE: @llvm.ppc.altivec.vcmpequh

+  res_vbs = vec_cmpeq(vbs, vbs);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+
+  res_vbs = vec_cmpeq(vbs, vbs);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+
  res_vbi = vec_cmpeq(vi, vi);
// CHECK: @llvm.ppc.altivec.vcmpequw
// CHECK-LE: @llvm.ppc.altivec.vcmpequw

@@ -954,6 +1024,14 @@ void test2() {
// CHECK: @llvm.ppc.altivec.vcmpequw
// CHECK-LE: @llvm.ppc.altivec.vcmpequw

+  res_vbi = vec_cmpeq(vbi, vbi);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+
+  res_vbi = vec_cmpeq(vbi, vbi);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+
  res_vbi = vec_cmpeq(vf, vf);
// CHECK: @llvm.ppc.altivec.vcmpeqfp
// CHECK-LE: @llvm.ppc.altivec.vcmpeqfp

@@ -1148,6 +1226,22 @@ void test6() {
// CHECK: @llvm.ppc.altivec.vctuxs
// CHECK-LE: @llvm.ppc.altivec.vctuxs

+  res_vi = vec_signed(vf);
+// CHECK: fptosi <4 x float>
+// CHECK-LE: fptosi <4 x float>
+
+  res_vui = vec_unsigned(vf);
+// CHECK: fptoui <4 x float>
+// CHECK-LE: fptoui <4 x float>
+
+  res_vf = vec_float(vi);
+// CHECK: sitofp <4 x i32>
+// CHECK-LE: sitofp <4 x i32>
+
+  res_vf = vec_float(vui);
+// CHECK: uitofp <4 x i32>
+// CHECK-LE: uitofp <4 x i32>
+
  /* vec_div */
  res_vsc = vec_div(vsc, vsc);
// CHECK: sdiv <16 x i8>

@@ -3499,6 +3593,79 @@ void test6() {
// CHECK-LE: sub nsw i32 31
// CHECK-LE: @llvm.ppc.altivec.vperm

+  /* vec_sldw */
+  res_vsc = vec_sldw(vsc, vsc, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vuc = vec_sldw(vuc, vuc, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vi = vec_sldw(vi, vi, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vui = vec_sldw(vui, vui, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vs = vec_sldw(vs, vs, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vus = vec_sldw(vus, vus, 0);
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3
+  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15
+  // CHECK: @llvm.ppc.altivec.vperm
+  // CHECK-LE: sub nsw i32 16
+  // CHECK-LE: sub nsw i32 17
+  // CHECK-LE: sub nsw i32 18
+  // CHECK-LE: sub nsw i32 31
+  // CHECK-LE: @llvm.ppc.altivec.vperm
+
  res_vsc = vec_vsldoi(vsc, vsc, 0);
  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1
  // CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2

@@ -5159,6 +5326,8 @@ void test6() {
// CHECK: fsub <4 x float>
// CHECK-LE: fsub <4 x float>

+
+
  res_vsc = vec_vsububm(vsc, vsc);
// CHECK: sub <16 x i8>
// CHECK-LE: sub <16 x i8>

@@ -5240,6 +5409,10 @@ void test6() {
// CHECK: @llvm.ppc.altivec.vsubcuw
// CHECK-LE: @llvm.ppc.altivec.vsubcuw

+  res_vi = vec_subc(vi, vi);
+// CHECK: @llvm.ppc.altivec.vsubcuw
+// CHECK-LE: @llvm.ppc.altivec.vsubcuw
+
  res_vui = vec_vsubcuw(vui, vui);
// CHECK: @llvm.ppc.altivec.vsubcuw
// CHECK-LE: @llvm.ppc.altivec.vsubcuw

@@ -5317,6 +5490,26 @@ void test6() {
// CHECK: @llvm.ppc.altivec.vsubuws
// CHECK-LE: @llvm.ppc.altivec.vsubuws

+  res_vi = vec_sube(vi, vi, vi);
+// CHECK: and <4 x i32>
+// CHECK: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK: add <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK-LE: and <4 x i32>
+// CHECK-LE: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-LE: add <4 x i32>
+// CHECK-LE: add <4 x i32>
+
+  res_vui = vec_sube(vui, vui, vui);
+// CHECK: and <4 x i32>
+// CHECK: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK: add <4 x i32>
+// CHECK: add <4 x i32>
+// CHECK-LE: and <4 x i32>
+// CHECK-LE: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-LE: add <4 x i32>
+// CHECK-LE: add <4 x i32>
+
  res_vsc = vec_vsubsbs(vsc, vsc);
// CHECK: @llvm.ppc.altivec.vsubsbs
// CHECK-LE: @llvm.ppc.altivec.vsubsbs

@@ -8996,3 +9189,274 @@ void test7() {
// CHECK: @llvm.ppc.altivec.vcmpgefp.p(i32 2
// CHECK-LE: @llvm.ppc.altivec.vcmpgefp.p(i32 2
}
+
+/* ------------------------------ optional ---------------------------------- */
+void test8() {
+// CHECK-LABEL: define void @test8
+// CHECK-LE-LABEL: define void @test8
+  res_vbc = vec_reve(vbc);
+  // CHECK: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vsc = vec_reve(vsc);
+  // CHECK: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vuc = vec_reve(vuc);
+  // CHECK: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vbi = vec_reve(vbi);
+  // CHECK: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+
+  res_vi = vec_reve(vi);
+  // CHECK: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+
+  res_vui = vec_reve(vui);
+  // CHECK: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+
+  res_vbs = vec_reve(vbs);
+  // CHECK: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vbs = vec_reve(vs);
+  // CHECK: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vbs = vec_reve(vus);
+  // CHECK: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  res_vf = vec_reve(vf);
+  // CHECK: shufflevector <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-LE: shufflevector <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+
+  res_vbc = vec_revb(vbc);
+// CHECK: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vbc, align 16
+// CHECK: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK: store <16 x i8> [[T3]], <16 x i8>* @res_vbc, align 16
+// CHECK-LE: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vbc, align 16
+// CHECK-LE: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK-LE: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK-LE: store <16 x i8> [[T3]], <16 x i8>* @res_vbc, align 16
+
+  res_vsc = vec_revb(vsc);
+// CHECK: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vsc, align 16
+// CHECK: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK: store <16 x i8> [[T3]], <16 x i8>* @res_vsc, align 16
+// CHECK-LE: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vsc, align 16
+// CHECK-LE: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK-LE: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK-LE: store <16 x i8> [[T3]], <16 x i8>* @res_vsc, align 16
+
+  res_vuc = vec_revb(vuc);
+// CHECK: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vuc, align 16
+// CHECK: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK: store <16 x i8> [[T3]], <16 x i8>* @res_vuc, align 16
+// CHECK-LE: [[T1:%.+]] = load <16 x i8>, <16 x i8>* @vuc, align 16
+// CHECK-LE: store <16 x i8> [[T1]], <16 x i8>* [[T2:%.+]], align 16
+// CHECK-LE: [[T3:%.+]] = load <16 x i8>, <16 x i8>* [[T2]], align 16
+// CHECK-LE: store <16 x i8> [[T3]], <16 x i8>* @res_vuc, align 16
+
+  res_vbs = vec_revb(vbs);
+// CHECK: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vs = vec_revb(vs);
+// CHECK: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vus = vec_revb(vus);
+// CHECK: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vbi = vec_revb(vbi);
+// CHECK: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vi = vec_revb(vi);
+// CHECK: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vui = vec_revb(vui);
+// CHECK: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+
+  res_vf = vec_revb(vf);
+// CHECK: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+// CHECK-LE: store <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16
+// CHECK-LE: xor <16 x i8>
+// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}})
+}
+
+/* ------------------------------ vec_xl ------------------------------------ */
+void test9() {
+  // CHECK-LABEL: define void @test9
+  // CHECK-LE-LABEL: define void @test9
+  res_vsc = vec_xl(param_sll, &param_sc);
+  // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+
+  res_vuc = vec_xl(param_sll, &param_uc);
+  // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+
+  res_vs = vec_xl(param_sll, &param_s);
+  // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+
+  res_vus = vec_xl(param_sll, &param_us);
+  // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+
+  res_vi = vec_xl(param_sll, &param_i);
+  // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+
+  res_vui = vec_xl(param_sll, &param_ui);
+  // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+
+  res_vf = vec_xl(param_sll, &param_f);
+  // CHECK: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 16
+  // CHECK-LE: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 16
+}
+
+/* ------------------------------ vec_xst ----------------------------------- */
+void test10() {
+  // CHECK-LABEL: define void @test10
+  // CHECK-LE-LABEL: define void @test10
+  vec_xst(vsc, param_sll, &param_sc);
+  // CHECK: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+
+  vec_xst(vuc, param_sll, &param_uc);
+  // CHECK: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+
+  vec_xst(vs, param_sll, &param_s);
+  // CHECK: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+
+  vec_xst(vus, param_sll, &param_us);
+  // CHECK: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+
+  vec_xst(vi, param_sll, &param_i);
+  // CHECK: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+
+  vec_xst(vui, param_sll, &param_ui);
+  // CHECK: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+
+  vec_xst(vf, param_sll, &param_f);
+  // CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16
+  // CHECK-LE: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16
+}
+
+/* ----------------------------- vec_xl_be ---------------------------------- */
+void test11() {
+  // CHECK-LABEL: define void @test11
+  // CHECK-LE-LABEL: define void @test11
+  res_vsc = vec_xl_be(param_sll, &param_sc);
+  // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+
+  res_vuc = vec_xl_be(param_sll, &param_uc);
+  // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+
+  res_vs = vec_xl_be(param_sll, &param_s);
+  // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+
+  res_vus = vec_xl_be(param_sll, &param_us);
+  // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+
+  res_vi = vec_xl_be(param_sll, &param_i);
+  // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
+
+  res_vui = vec_xl_be(param_sll, &param_ui);
+  // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
+
+  res_vf = vec_xl_be(param_sll, &param_f);
+  // CHECK: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
+}
+
+/* ----------------------------- vec_xst_be --------------------------------- */
+void test12() {
+  // CHECK-LABEL: define void @test12
+  // CHECK-LE-LABEL: define void @test12
+  vec_xst_be(vsc, param_sll, &param_sc);
+  // CHECK: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vuc, param_sll, &param_uc);
+  // CHECK: store <16 x i8> %{{[0-9]+}}, <16 x i8>* %{{[0-9]+}}, align 16
+  // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  // CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vs, param_sll, &param_s);
+  // CHECK: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vus, param_sll, &param_us);
+  // CHECK: store <8 x i16> %{{[0-9]+}}, <8 x i16>* %{{[0-9]+}}, align 16
+  // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vi, param_sll, &param_i);
+  // CHECK: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vui, param_sll, &param_ui);
+  // CHECK: store <4 x i32> %{{[0-9]+}}, <4 x i32>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %{{[0-9]+}}, i8* %{{[0-9]+}})
+
+  vec_xst_be(vf, param_sll, &param_f);
+  // CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16
+  // CHECK-LE: call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %{{[0-9]+}}, i8* %{{[0-9]+}})
+}

diff --git a/test/CodeGen/builtins-ppc-crypto.c b/test/CodeGen/builtins-ppc-crypto.c
index 60bdc4982d92..eaf568b09fb9 100644
--- a/test/CodeGen/builtins-ppc-crypto.c
+++ b/test/CodeGen/builtins-ppc-crypto.c
@@ -108,6 +108,30 @@ vector unsigned long long test_vpermxord(void)
// CHECK: @llvm.ppc.altivec.crypto.vpermxor
}

+// CHECK-LABEL: test_vpermxorbc
+vector bool char test_vpermxorbc(vector bool char a,
+                                 vector bool char b,
+                                 vector bool char c) {
+  return vec_permxor(a, b, c);
+// CHECK: @llvm.ppc.altivec.crypto.vpermxor
+}
+
+// CHECK-LABEL: test_vpermxorsc
+vector signed char test_vpermxorsc(vector signed char a,
+                                   vector signed char b,
+                                   vector signed char c) {
+  return vec_permxor(a, b, c);
+// CHECK: @llvm.ppc.altivec.crypto.vpermxor
+}
+
+// CHECK-LABEL: test_vpermxoruc
+vector unsigned char test_vpermxoruc(vector unsigned char a,
+                                     vector unsigned char b,
+                                     vector unsigned char c) {
+  return vec_permxor(a, b, c);
+// CHECK: @llvm.ppc.altivec.crypto.vpermxor
+}
+
// CHECK-LABEL: define <2 x i64> @test_vcipher
vector unsigned long long test_vcipher(void)
{

diff --git a/test/CodeGen/builtins-ppc-p8vector.c b/test/CodeGen/builtins-ppc-p8vector.c
index 096e3e1bb6b5..97a663c2f28f 100644
--- a/test/CodeGen/builtins-ppc-p8vector.c
+++ b/test/CodeGen/builtins-ppc-p8vector.c
@@ -73,13 +73,6 @@ void test1() {
// CHECK-LE: call <2 x i64> @llvm.ppc.altivec.vmaxsd(<2 x i64> %{{[0-9]*}}, <2 x i64>
// CHECK-PPC: error: call to 'vec_abs' is ambiguous

-  res_vd = vec_abs(vda);
-// CHECK: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
-// CHECK: store <2 x double> %{{.*}}, <2 x double>* @res_vd
-// CHECK-LE: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
-// CHECK-LE: store <2 x double> %{{.*}}, <2 x double>* @res_vd
-// CHECK-PPC: error: call to 'vec_abs' is ambiguous
-
  /* vec_add */
  res_vsll = vec_add(vsll, vsll);
// CHECK: add <2 x i64>

@@ -136,6 +129,26 @@ void test1() {
// CHECK-LE: @llvm.ppc.altivec.vperm
// CHECK-PPC: warning: implicit declaration of function 'vec_mergee'

+  res_vbll = vec_mergee(vbll, vbll);
+// CHECK: @llvm.ppc.altivec.vperm
+// CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vsll = vec_mergee(vsll, vsll);
+// CHECK: @llvm.ppc.altivec.vperm
+// CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vull = vec_mergee(vull, vull);
+// CHECK: @llvm.ppc.altivec.vperm
+// CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vf = vec_mergee(vfa, vfa);
+// CHECK: @llvm.ppc.altivec.vperm
+// CHECK-LE: @llvm.ppc.altivec.vperm
+
+  res_vd = vec_mergee(vda, vda);
+// CHECK: @llvm.ppc.altivec.vperm
+// CHECK-LE: @llvm.ppc.altivec.vperm
+
  /* vec_mergeo */
  res_vbi = vec_mergeo(vbi, vbi);
// CHECK: @llvm.ppc.altivec.vperm

@@ -151,6 +164,11 @@ void test1() {
// CHECK-LE: @llvm.ppc.altivec.vperm
// CHECK-PPC: warning: implicit declaration of function 'vec_mergeo'

  /* vec_cmpeq */
+  res_vbll = vec_cmpeq(vbll, vbll);
+// CHECK: @llvm.ppc.altivec.vcmpequd
+// CHECK-LE: @llvm.ppc.altivec.vcmpequd
+// CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous
+
  res_vbll = vec_cmpeq(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd
// CHECK-LE: @llvm.ppc.altivec.vcmpequd

@@ -205,15 +223,6 @@ void test1() {
// CHECK-LE: call <2 x i64> @llvm.ppc.altivec.vcmpgtud(<2 x i64> %{{[0-9]*}}, <2 x i64> %{{[0-9]*}})
// CHECK-PPC: error: call to 'vec_cmplt' is ambiguous

-  /* vec_double */
-  res_vd = vec_double(vsll);
-// CHECK: sitofp i64 {{.+}} to double
-// CHECK-BE: sitofp i64 {{.+}} to double
-
-  res_vd = vec_double(vull);
-// CHECK: uitofp i64 {{.+}} to double
-// CHECK-BE: uitofp i64 {{.+}} to double
-
  /* vec_eqv */
  res_vsc = vec_eqv(vsc, vsc);
// CHECK: [[T1:%.+]] = bitcast <16 x i8> {{.+}} to <4 x i32>

@@ -1266,6 +1275,12 @@ void test1() {
// CHECK-LE: [[T1:%.+]] = and <4 x i32>
// CHECK-LE: xor <4 x i32> [[T1]], <i32 -1, i32 -1, i32 -1, i32 -1>

+  res_vf = vec_nand(vfa, vfa);
+// CHECK: [[T1:%.+]] = and <4 x i32>
+// CHECK: xor <4 x i32> [[T1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-LE: [[T1:%.+]] = and <4 x i32>
+// CHECK-LE: xor <4 x i32> [[T1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+
  res_vsll = vec_nand(vsll, vsll);
// CHECK: [[T1:%.+]] = and <2 x i64>
// CHECK: xor <2 x i64> [[T1]], <i64 -1, i64 -1>

@@ -1284,6 +1299,12 @@ void test1() {
// CHECK-LE: [[T1:%.+]] = and <2 x i64>
// CHECK-LE: xor <2 x i64> [[T1]], <i64 -1, i64 -1>

+  res_vd = vec_nand(vda, vda);
+// CHECK: [[T1:%.+]] = and <2 x i64>
+// CHECK: xor <2 x i64> [[T1]], <i64 -1, i64 -1>
+// CHECK-LE: [[T1:%.+]] = and <2 x i64>
+// CHECK-LE: xor <2 x i64> [[T1]], <i64 -1, i64 -1>
+
  /* vec_orc */
  res_vsc = vec_orc(vsc, vsc);
// CHECK: [[T1:%.+]] = xor <16 x i8> {{%.+}}, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>

@@ -1412,6 +1433,18 @@ void test1() {
// CHECK-LE: [[T1:%.+]] = xor <4 x i32> {{%.+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK-LE: or <4 x i32> {{%.+}}, [[T1]]

+  res_vf = vec_orc(vbi, vfa);
+// CHECK: [[T1:%.+]] = xor <4 x i32> {{%.+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK: or <4 x i32> {{%.+}}, [[T1]]
+// CHECK-LE: [[T1:%.+]] = xor <4 x i32> {{%.+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-LE: or <4 x i32> {{%.+}}, [[T1]]
+
+  res_vf = vec_orc(vfa, vbi);
+// CHECK: [[T1:%.+]] = xor <4 x i32> {{%.+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK: or <4 x i32> {{%.+}}, [[T1]]
+// CHECK-LE: [[T1:%.+]] = xor <4 x i32> {{%.+}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-LE: or <4 x i32> {{%.+}}, [[T1]]
+
  res_vsll = vec_orc(vsll, vsll);
// CHECK: [[T1:%.+]] = xor <2 x i64> {{%.+}}, <i64 -1, i64 -1>
// CHECK: or <2 x i64> {{%.+}}, [[T1]]

@@ -1454,6 +1487,18 @@ void test1() {
// CHECK-LE: [[T1:%.+]] = xor <2 x i64> {{%.+}}, <i64 -1, i64 -1>
// CHECK-LE: or <2 x i64> {{%.+}}, [[T1]]

+  res_vd = vec_orc(vbll, vda);
+// CHECK: [[T1:%.+]] = xor <2 x i64>
{{%.+}}, <i64 -1, i64 -1> +// CHECK: or <2 x i64> {{%.+}}, [[T1]] +// CHECK-LE: [[T1:%.+]] = xor <2 x i64> {{%.+}}, <i64 -1, i64 -1> +// CHECK-LE: or <2 x i64> {{%.+}}, [[T1]] + + res_vd = vec_orc(vda, vbll); +// CHECK: [[T1:%.+]] = xor <2 x i64> {{%.+}}, <i64 -1, i64 -1> +// CHECK: or <2 x i64> {{%.+}}, [[T1]] +// CHECK-LE: [[T1:%.+]] = xor <2 x i64> {{%.+}}, <i64 -1, i64 -1> +// CHECK-LE: or <2 x i64> {{%.+}}, [[T1]] + + /* vec_sub */ res_vsll = vec_sub(vsll, vsll); // CHECK: sub <2 x i64> @@ -1504,4 +1549,77 @@ void test1() { // CHECK: llvm.ppc.altivec.vbpermq // CHECK-LE: llvm.ppc.altivec.vbpermq // CHECK-PPC: warning: implicit declaration of function 'vec_bperm' + + res_vsll = vec_neg(vsll); +// CHECK: sub <2 x i64> zeroinitializer, {{%[0-9]+}} +// CHECK-LE: sub <2 x i64> zeroinitializer, {{%[0-9]+}} +// CHECK-PPC: call to 'vec_neg' is ambiguous + + +} + + +vector signed int test_vec_addec_signed (vector signed int a, vector signed int b, vector signed int c) { + return vec_addec(a, b, c); +// CHECK-LABEL: @test_vec_addec_signed +// CHECK: icmp slt i32 {{%[0-9]+}}, 4 +// CHECK: extractelement +// CHECK: extractelement +// CHECK: extractelement +// CHECK: and i32 {{%[0-9]+}}, 1 +// CHECK: zext +// CHECK: zext +// CHECK: zext +// CHECK: add i64 +// CHECK: add i64 +// CHECK: lshr i64 +// CHECK: and i64 +// CHECK: trunc i64 {{%[0-9]+}} to i32 +// CHECK: zext i32 +// CHECK: trunc i64 {{%[0-9]+}} to i32 +// CHECK: sext i32 +// CHECK: add nsw i32 +// CHECK: br label +// CHECK: ret <4 x i32> + +} + + +vector unsigned int test_vec_addec_unsigned (vector unsigned int a, vector unsigned int b, vector unsigned int c) { + return vec_addec(a, b, c); + +// CHECK-LABEL: @test_vec_addec_unsigned +// CHECK: icmp slt i32 {{%[0-9]+}}, 4 +// CHECK: extractelement +// CHECK: and i32 +// CHECK: extractelement +// CHECK: zext i32 +// CHECK: extractelement +// CHECK: zext i32 +// CHECK: zext i32 +// CHECK: add i64 +// CHECK: lshr i64 +// CHECK: and i64 +// CHECK: trunc i64 {{%[0-9]+}} to i32 +// CHECK: zext i32 +// CHECK: trunc i64 {{%[0-9]+}} to i32 +// CHECK: sext i32 +// CHECK: add nsw i32 +// CHECK: br label +// CHECK: ret <4 x i32> +} + +vector signed int test_vec_subec_signed (vector signed int a, vector signed int b, vector signed int c) { + return vec_subec(a, b, c); +// CHECK-LABEL: @test_vec_subec_signed +// CHECK: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> +} + +vector unsigned int test_vec_subec_unsigned (vector unsigned int a, vector unsigned int b, vector unsigned int c) { + return vec_subec(a, b, c); + +// CHECK-LABEL: @test_vec_subec_unsigned +// CHECK: xor <4 x i32> {{%[0-9]+}}, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> } diff --git a/test/CodeGen/builtins-ppc-p9vector.c b/test/CodeGen/builtins-ppc-p9vector.c new file mode 100644 index 000000000000..f70d2f9f1504 --- /dev/null +++ b/test/CodeGen/builtins-ppc-p9vector.c @@ -0,0 +1,1182 @@ +// REQUIRES: powerpc-registered-target +// RUN: %clang_cc1 -faltivec -target-feature +power9-vector \ +// RUN: -triple powerpc64-unknown-unknown -emit-llvm %s \ +// RUN: -o - | FileCheck %s -check-prefix=CHECK-BE + +// RUN: %clang_cc1 -faltivec -target-feature +power9-vector \ +// RUN: -triple powerpc64le-unknown-unknown -emit-llvm %s \ +// RUN: -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE + +#include <altivec.h> + +vector signed char vsca, vscb; +vector unsigned char vuca, vucb; +vector bool char vbca, vbcb; +vector signed short vssa, vssb; +vector unsigned short vusa, vusb; +vector bool short vbsa, vbsb; +vector signed int
vsia, vsib; +vector unsigned int vuia, vuib; +vector bool int vbia, vbib; +vector signed long long vsla, vslb; +vector unsigned long long vula, vulb; +vector bool long long vbla, vblb; +vector float vfa, vfb; +vector double vda, vdb; +vector unsigned __int128 vui128a, vui128b; +vector signed __int128 vsi128a, vsi128b; + +float f[4] = { 23.4f, 56.7f, 89.0f, 12.3f }; +double d[2] = { 23.4, 56.7 }; +signed char sc[16] = { -8, 9, -10, 11, -12, 13, -14, 15, + -0, 1, -2, 3, -4, 5, -6, 7 }; +unsigned char uc[16] = { 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7 }; +signed short ss[8] = { -1, 2, -3, 4, -5, 6, -7, 8 }; +unsigned short us[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; +signed int si[4] = { -1, 2, -3, 4 }; +unsigned int ui[4] = { 0, 1, 2, 3 }; +signed long sl[2] = { -1L, 2L }; +unsigned long ul[2] = { 1L, 2L }; +signed long long sll[2] = { 1LL, 1LL }; +unsigned long long ull[2] = { -1LL, 1LL }; +signed __int128 sint128[1] = { -1 }; +unsigned __int128 uint128[1] = { 1 }; + +unsigned test1(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_match_index (vsca, vscb); +} +unsigned test2(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_match_index (vuca, vucb); +} +unsigned test3(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_match_index (vsia, vsib); +} +unsigned test4(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_match_index (vuia, vuib); +} +unsigned test5(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq 
i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_match_index (vssa, vssb); +} +unsigned test6(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_match_index (vusa, vusb); +} +unsigned test7(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: or <16 x i8> +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: or <16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: or <16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: or <16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_match_or_eos_index (vsca, vscb); +} +unsigned test8(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: or <16 x i8> +// CHECK-BE: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK-BE: or <16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: or <16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpequb(<16 x i8> +// CHECK: or <16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_match_or_eos_index (vuca, vucb); +} +unsigned test9(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: or <4 x i32> +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: or <4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: or <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: or <4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: 
icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_match_or_eos_index (vsia, vsib); +} +unsigned test10(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: or <4 x i32> +// CHECK-BE: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK-BE: or <4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: or <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpequw(<4 x i32> +// CHECK: or <4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_match_or_eos_index (vuia, vuib); +} +unsigned test11(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: or <8 x i16> +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: or <8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: or <8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: or <8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_match_or_eos_index (vssa, vssb); +} +unsigned test12(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: or <8 x i16> +// CHECK-BE: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK-BE: or <8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: or <8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpequh(<8 x i16> +// CHECK: or <8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_match_or_eos_index (vusa, vusb); +} +unsigned test13(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_mismatch_index (vsca, vscb); +} +unsigned test14(void) { +// CHECK-BE: 
@llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_mismatch_index (vuca, vucb); +} +unsigned test15(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_mismatch_index (vsia, vsib); +} +unsigned test16(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_mismatch_index (vuia, vuib); +} +unsigned test17(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_mismatch_index (vssa, vssb); +} +unsigned test18(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_mismatch_index (vusa, vusb); +} +unsigned test19(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpnezb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_mismatch_or_eos_index (vsca, 
vscb); +} +unsigned test20(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezb(<16 x i8> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 3 +// CHECK: @llvm.ppc.altivec.vcmpnezb(<16 x i8> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 3 + return vec_first_mismatch_or_eos_index (vuca, vucb); +} +unsigned test21(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezw(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpnezw(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_mismatch_or_eos_index (vsia, vsib); +} +unsigned test22(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezw(<4 x i32> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 5 +// CHECK: @llvm.ppc.altivec.vcmpnezw(<4 x i32> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 5 + return vec_first_mismatch_or_eos_index (vuia, vuib); +} +unsigned test23(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpnezh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_mismatch_or_eos_index (vssa, vssb); +} +unsigned test24(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnezh(<8 x i16> +// CHECK-BE: @llvm.ctlz.v2i64(<2 x i64> +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: icmp eq i64 {{.*}}, 64 +// CHECK-BE: extractelement <2 x i64> +// CHECK-BE: add i64 {{.*}}, 64 +// CHECK-BE: lshr i64 {{.*}}, 4 +// CHECK: @llvm.ppc.altivec.vcmpnezh(<8 x i16> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK: extractelement <2 x i64> +// CHECK: icmp eq i64 {{.*}}, 64 +// CHECK: extractelement <2 x i64> +// CHECK: add i64 {{.*}}, 64 +// CHECK: lshr i64 {{.*}}, 4 + return vec_first_mismatch_or_eos_index (vusa, vusb); +} +vector bool char test25(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_cmpne (vbca, vbcb); +} +vector bool char test26(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_cmpne (vsca, vscb); +} +vector bool char test27(void) { +// CHECK-BE: 
@llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ppc.altivec.vcmpneb(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_cmpne (vuca, vucb); +} +vector bool int test28(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cmpne (vbia, vbib); +} +vector bool int test29(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cmpne (vsia, vsib); +} +vector bool int test30(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cmpne (vuia, vuib); +} +vector bool long long test31(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK-BE: xor <2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK: xor <2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cmpne (vbla, vblb); +} +vector bool long long test32(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK-BE: xor <2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK: xor <2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cmpne (vsla, vslb); +} +vector bool long long test33(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK-BE: xor <2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK: xor <2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cmpne (vula, vulb); +} +vector bool short test34(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_cmpne (vbsa, vbsb); +} +vector bool short test35(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_cmpne (vssa, vssb); +} +vector bool short test36(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.ppc.altivec.vcmpneh(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_cmpne (vusa, vusb); +} +vector bool long long test37(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK-BE: xor <2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vcmpequd(<2 x i64> +// CHECK: xor <2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cmpne (vda, vdb); +} +vector bool int test38(void) { +// CHECK-BE: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vcmpnew(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cmpne (vfa, vfb); +} +vector signed char test39(void) { +// CHECK-BE: @llvm.cttz.v16i8(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.cttz.v16i8(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_cnttz (vsca); +} +vector unsigned char test40(void) { +// CHECK-BE: @llvm.cttz.v16i8(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.cttz.v16i8(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_cnttz (vuca); +} +vector signed int test41(void) { +// CHECK-BE: @llvm.cttz.v4i32(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.cttz.v4i32(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cnttz (vsia); 
+} +vector unsigned int test42(void) { +// CHECK-BE: @llvm.cttz.v4i32(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.cttz.v4i32(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_cnttz (vuia); +} +vector signed long long test43(void) { +// CHECK-BE: @llvm.cttz.v2i64(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cnttz (vsla); +} +vector unsigned long long test44(void) { +// CHECK-BE: @llvm.cttz.v2i64(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.cttz.v2i64(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_cnttz (vula); +} +vector signed short test45(void) { +// CHECK-BE: @llvm.cttz.v8i16(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.cttz.v8i16(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_cnttz (vssa); +} +vector unsigned short test46(void) { +// CHECK-BE: @llvm.cttz.v8i16(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.cttz.v8i16(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_cnttz (vusa); +} +vector unsigned char test47(void) { +// CHECK-BE: @llvm.ctpop.v16i8(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ctpop.v16i8(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_popcnt (vsca); +} +vector unsigned char test48(void) { +// CHECK-BE: @llvm.ctpop.v16i8(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ctpop.v16i8(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_popcnt (vuca); +} +vector unsigned int test49(void) { +// CHECK-BE: @llvm.ctpop.v4i32(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ctpop.v4i32(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_popcnt (vsia); +} +vector unsigned int test50(void) { +// CHECK-BE: @llvm.ctpop.v4i32(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ctpop.v4i32(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_popcnt (vuia); +} +vector unsigned long long test51(void) { +// CHECK-BE: @llvm.ctpop.v2i64(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ctpop.v2i64(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_popcnt (vsla); +} +vector unsigned long long test52(void) { +// CHECK-BE: @llvm.ctpop.v2i64(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ctpop.v2i64(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_popcnt (vula); +} +vector unsigned short test53(void) { +// CHECK-BE: @llvm.ctpop.v8i16(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.ctpop.v8i16(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_popcnt (vssa); +} +vector unsigned short test54(void) { +// CHECK-BE: @llvm.ctpop.v8i16(<8 x i16> +// CHECK-BE-NEXT: ret <8 x i16> +// CHECK: @llvm.ctpop.v8i16(<8 x i16> +// CHECK-NEXT: ret <8 x i16> + return vec_popcnt (vusa); +} +vector double test55(void) { +// CHECK-BE: @llvm.ppc.vsx.xviexpdp(<2 x i64> %{{.+}}, <2 x i64> +// CHECK-BE-NEXT: ret <2 x double> +// CHECK: @llvm.ppc.vsx.xviexpdp(<2 x i64> %{{.+}}, <2 x i64> +// CHECK-NEXT: ret <2 x double> + return vec_insert_exp (vda,vulb); +} +vector double test56(void) { +// CHECK-BE: @llvm.ppc.vsx.xviexpdp(<2 x i64> %{{.+}}, <2 x i64> +// CHECK-BE-NEXT: ret <2 x double> +// CHECK: @llvm.ppc.vsx.xviexpdp(<2 x i64> %{{.+}}, <2 x i64> +// CHECK-NEXT: ret <2 x double> + return vec_insert_exp (vula, vulb); +} +vector float test57(void) { +// CHECK-BE: @llvm.ppc.vsx.xviexpsp(<4 x i32> %{{.+}}, <4 x i32> +// CHECK-BE-NEXT: ret <4 x float> +// CHECK: @llvm.ppc.vsx.xviexpsp(<4 x i32> %{{.+}}, <4 x i32> +// CHECK-NEXT: ret <4 x float> + return vec_insert_exp 
(vfa,vuib); +} +vector float test58(void) { +// CHECK-BE: @llvm.ppc.vsx.xviexpsp(<4 x i32> %{{.+}}, <4 x i32> +// CHECK-BE-NEXT: ret <4 x float> +// CHECK: @llvm.ppc.vsx.xviexpsp(<4 x i32> %{{.+}}, <4 x i32> +// CHECK-NEXT: ret <4 x float> + return vec_insert_exp (vuia,vuib); +} +signed int test59(void) { +// CHECK-BE: @llvm.ppc.altivec.vclzlsbb(<16 x i8> +// CHECK-BE-NEXT: ret i32 +// CHECK-LE: @llvm.ppc.altivec.vctzlsbb(<16 x i8> +// CHECK-LE-NEXT: ret i32 + return vec_cntlz_lsbb (vuca); +} +signed int test60(void) { +// CHECK-BE: @llvm.ppc.altivec.vclzlsbb(<16 x i8> +// CHECK-BE-NEXT: ret i32 +// CHECK-LE: @llvm.ppc.altivec.vctzlsbb(<16 x i8> +// CHECK-LE-NEXT: ret i32 + return vec_cntlz_lsbb (vsca); +} +signed int test61(void) { +// CHECK-BE: @llvm.ppc.altivec.vctzlsbb(<16 x i8> +// CHECK-BE-NEXT: ret i32 +// CHECK-LE: @llvm.ppc.altivec.vclzlsbb(<16 x i8> +// CHECK-LE-NEXT: ret i32 + return vec_cnttz_lsbb (vsca); +} +signed int test62(void) { +// CHECK-BE: @llvm.ppc.altivec.vctzlsbb(<16 x i8> +// CHECK-BE-NEXT: ret i32 +// CHECK-LE: @llvm.ppc.altivec.vclzlsbb(<16 x i8> +// CHECK-LE-NEXT: ret i32 + return vec_cnttz_lsbb (vuca); +} +vector unsigned int test63(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybw(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vprtybw(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_parity_lsbb (vuia); +} +vector unsigned int test64(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybw(<4 x i32> +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vprtybw(<4 x i32> +// CHECK-NEXT: ret <4 x i32> + return vec_parity_lsbb (vsia); +} +vector unsigned long long test65(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybd(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vprtybd(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_parity_lsbb (vula); +} +vector unsigned long long test66(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybd(<2 x i64> +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vprtybd(<2 x i64> +// CHECK-NEXT: ret <2 x i64> + return vec_parity_lsbb (vsla); +} +vector unsigned __int128 test67(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybq(<1 x i128> +// CHECK-BE-NEXT: ret <1 x i128> +// CHECK: @llvm.ppc.altivec.vprtybq(<1 x i128> +// CHECK-NEXT: ret <1 x i128> + return vec_parity_lsbb (vui128a); +} +vector unsigned __int128 test68(void) { +// CHECK-BE: @llvm.ppc.altivec.vprtybq(<1 x i128> +// CHECK-BE-NEXT: ret <1 x i128> +// CHECK: @llvm.ppc.altivec.vprtybq(<1 x i128> +// CHECK-NEXT: ret <1 x i128> + return vec_parity_lsbb (vsi128a); +} +vector unsigned char test69(void) { +// CHECK-BE: call <16 x i8> @llvm.ppc.altivec.vabsdub(<16 x i8> {{.+}}, <16 x i8> {{.+}}) +// CHECK: call <16 x i8> @llvm.ppc.altivec.vabsdub(<16 x i8> {{.+}}, <16 x i8> {{.+}}) + return vec_absd(vuca, vucb); +} +vector unsigned short test70(void) { +// CHECK-BE: call <8 x i16> @llvm.ppc.altivec.vabsduh(<8 x i16> {{.+}}, <8 x i16> {{.+}}) +// CHECK: call <8 x i16> @llvm.ppc.altivec.vabsduh(<8 x i16> {{.+}}, <8 x i16> {{.+}}) + return vec_absd(vusa, vusb); +} +vector unsigned int test71(void) { +// CHECK-BE: call <4 x i32> @llvm.ppc.altivec.vabsduw(<4 x i32> {{.+}}, <4 x i32> {{.+}}) +// CHECK: call <4 x i32> @llvm.ppc.altivec.vabsduw(<4 x i32> {{.+}}, <4 x i32> {{.+}}) + return vec_absd(vuia, vuib); +} +vector unsigned char test72(void) { +// CHECK-BE: @llvm.ppc.altivec.vslv(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ppc.altivec.vslv(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_slv (vuca, vucb); +} 
+vector unsigned char test73(void) { +// CHECK-BE: @llvm.ppc.altivec.vsrv(<16 x i8> +// CHECK-BE-NEXT: ret <16 x i8> +// CHECK: @llvm.ppc.altivec.vsrv(<16 x i8> +// CHECK-NEXT: ret <16 x i8> + return vec_srv (vuca, vucb); +} +vector unsigned short test74(void) { +// CHECK-BE: @llvm.ppc.vsx.xvcvsphp(<4 x float> +// CHECK-BE: @llvm.ppc.vsx.xvcvsphp(<4 x float> +// CHECK-BE: @llvm.ppc.altivec.vperm +// CHECK: @llvm.ppc.vsx.xvcvsphp(<4 x float> +// CHECK: @llvm.ppc.vsx.xvcvsphp(<4 x float> +// CHECK: @llvm.ppc.altivec.vperm + return vec_pack_to_short_fp32(vfa, vfb); +} +vector unsigned int test75(void) { +// CHECK-BE: @llvm.ppc.altivec.vrlwmi(<4 x i32 +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vrlwmi(<4 x i32 +// CHECK-NEXT: ret <4 x i32> + return vec_rlmi(vuia, vuia, vuia); +} +vector unsigned long long test76(void) { +// CHECK-BE: @llvm.ppc.altivec.vrldmi(<2 x i64 +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vrldmi(<2 x i64 +// CHECK-NEXT: ret <2 x i64> + return vec_rlmi(vula, vula, vula); +} +vector unsigned int test77(void) { +// CHECK-BE: @llvm.ppc.altivec.vrlwnm(<4 x i32 +// CHECK-BE: and <4 x i32 +// CHECK-BE: ret <4 x i32> +// CHECK: @llvm.ppc.altivec.vrlwnm(<4 x i32 +// CHECK: and <4 x i32 +// CHECK: ret <4 x i32> + return vec_rlnm(vuia, vuia, vuia); +} +vector unsigned long long test78(void) { +// CHECK-BE: @llvm.ppc.altivec.vrldnm(<2 x i64 +// CHECK-BE: and <2 x i64 +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.altivec.vrldnm(<2 x i64 +// CHECK: and <2 x i64 +// CHECK-NEXT: ret <2 x i64> + return vec_rlnm(vula, vula, vula); +} +vector double test79(void) { +// CHECK-BE: extractelement <4 x float> +// CHECK-BE: fpext float +// CHECK-BE: insertelement <2 x double> +// CHECK-BE: extractelement <4 x float> +// CHECK-BE: fpext float +// CHECK-BE: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> + return vec_unpackh(vfa); +} +vector double test80(void) { +// CHECK-BE: extractelement <4 x float> +// CHECK-BE: fpext float +// CHECK-BE: insertelement <2 x double> +// CHECK-BE: extractelement <4 x float> +// CHECK-BE: fpext float +// CHECK-BE: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> + return vec_unpackl(vfa); +} +vector double test81(void) { + // CHECK: extractelement <2 x double> + // CHECK: fptrunc double + // CHECK: insertelement <4 x float> + // CHECK: extractelement <2 x double> + // CHECK: fptrunc double + // CHECK: insertelement <4 x float> + // CHECK: extractelement <2 x double> + // CHECK: fptrunc double + // CHECK: insertelement <4 x float> + // CHECK: extractelement <2 x double> + // CHECK: fptrunc double + // CHECK: insertelement <4 x float> + // CHECK-LE: extractelement <2 x double> + // CHECK-LE: fptrunc double + // CHECK-LE: insertelement <4 x float> + // CHECK-LE: extractelement <2 x double> + // CHECK-LE: fptrunc double + // CHECK-LE: insertelement <4 x float> + // CHECK-LE: extractelement <2 x double> + // CHECK-LE: fptrunc double + // CHECK-LE: insertelement <4 x float> + // CHECK-LE: extractelement <2 x double> + // CHECK-LE: fptrunc double + // CHECK-LE: insertelement <4 x float> + return vec_pack(vda, vdb); +} +vector unsigned int test82(void) { +// CHECK-BE: 
@llvm.ppc.vsx.xvxexpsp(<4 x float> {{.+}}) +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.vsx.xvxexpsp(<4 x float> {{.+}}) +// CHECK-NEXT: ret <4 x i32> + return vec_extract_exp(vfa); +} +vector unsigned long long test83(void) { +// CHECK-BE: @llvm.ppc.vsx.xvxexpdp(<2 x double> {{.+}}) +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.vsx.xvxexpdp(<2 x double> {{.+}}) +// CHECK-NEXT: ret <2 x i64> + return vec_extract_exp(vda); +} +vector unsigned int test84(void) { +// CHECK-BE: @llvm.ppc.vsx.xvxsigsp(<4 x float> {{.+}}) +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.vsx.xvxsigsp(<4 x float> {{.+}}) +// CHECK-NEXT: ret <4 x i32> + return vec_extract_sig(vfa); +} +vector unsigned long long test85(void) { +// CHECK-BE: @llvm.ppc.vsx.xvxsigdp(<2 x double> {{.+}}) +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.vsx.xvxsigdp(<2 x double> {{.+}}) +// CHECK-NEXT: ret <2 x i64> + return vec_extract_sig(vda); +} +vector bool int test86(void) { +// CHECK-BE: @llvm.ppc.vsx.xvtstdcsp(<4 x float> {{.+}}, i32 127) +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.vsx.xvtstdcsp(<4 x float> {{.+}}, i32 127) +// CHECK-NEXT: ret <4 x i32> + return vec_test_data_class(vfa, __VEC_CLASS_FP_NOT_NORMAL); +} +vector bool long long test87(void) { +// CHECK-BE: @llvm.ppc.vsx.xvtstdcdp(<2 x double> {{.+}}, i32 127) +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: @llvm.ppc.vsx.xvtstdcdp(<2 x double> {{.+}}, i32 127) +// CHECK-NEXT: ret <2 x i64> + return vec_test_data_class(vda, __VEC_CLASS_FP_NOT_NORMAL); +} +vector unsigned char test88(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <16 x i8> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <16 x i8> + return vec_xl_len(uc,0); +} +vector signed char test89(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <16 x i8> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <16 x i8> + return vec_xl_len(sc,0); +} +vector unsigned short test90(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <8 x i16> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <8 x i16> + return vec_xl_len(us,0); +} +vector signed short test91(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <8 x i16> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <8 x i16> + return vec_xl_len(ss,0); +} +vector unsigned int test92(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-NEXT: ret <4 x i32> + return vec_xl_len(ui,0); +} + +vector signed int test93(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE-NEXT: ret <4 x i32> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-NEXT: ret <4 x i32> + return vec_xl_len(si,0); +} + +vector float test94(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <4 x float> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <4 x float> + return vec_xl_len(f,0); +} + +vector unsigned long long test95(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <2 x i64> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <2 x i64> + return vec_xl_len(ull,0); +} + +vector signed long long test96(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <2 x i64> +// CHECK:
@llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <2 x i64> + return vec_xl_len(sll,0); +} + +vector double test97(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <2 x double> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <2 x double> + return vec_xl_len(d,0); +} + +vector unsigned __int128 test98(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <1 x i128> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <1 x i128> + return vec_xl_len(uint128,0); +} + +vector signed __int128 test99(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK-BE: ret <1 x i128> +// CHECK: @llvm.ppc.vsx.lxvl(i8* %{{.+}}, i64 +// CHECK: ret <1 x i128> + return vec_xl_len(sint128,0); +} + +void test100(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vuca,uc,0); +} + +void test101(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vsca,sc,0); +} + +void test102(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vusa,us,0); +} + +void test103(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vssa,ss,0); +} + +void test104(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vuia,ui,0); +} + +void test105(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vsia,si,0); +} + +void test106(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vfa,f,0); +} + +void test107(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vula,ull,0); +} + +void test108(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vsla,sll,0); +} + +void test109(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vda,d,0); +} + +void test110(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vui128a,uint128,0); +} + +void test111(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.stxvl(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len(vsi128a,sint128,0); +} + +vector unsigned char test112(void) { +// CHECK-BE: @llvm.ppc.vsx.lxvll(i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.vsx.lxvll(i8* %{{.+}}, i64 +// CHECK: @llvm.ppc.altivec.lvsr(i8* %{{.+}} +// CHECK: @llvm.ppc.altivec.vperm + return vec_xl_len_r(uc,0); +} +void test113(void) { +// CHECK-BE: @llvm.ppc.vsx.stxvll(<4 x i32> %{{.+}}, i8* %{{.+}}, i64
+// CHECK: @llvm.ppc.altivec.lvsl(i8* %{{.+}} +// CHECK: @llvm.ppc.altivec.vperm +// CHECK: @llvm.ppc.vsx.stxvll(<4 x i32> %{{.+}}, i8* %{{.+}}, i64 + return vec_xst_len_r(vuca,uc,0); +} +vector float test114(void) { +// CHECK-BE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3> +// CHECK-BE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-BE-NEXT: ret <4 x float> +// CHECK-LE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef> +// CHECK-LE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-LE-NEXT: ret <4 x float> + return vec_extract_fp32_from_shorth(vusa); +} +vector float test115(void) { +// CHECK-BE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7> +// CHECK-BE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-BE-NEXT: ret <4 x float> +// CHECK-LE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> <i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef> +// CHECK-LE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-LE-NEXT: ret <4 x float> + return vec_extract_fp32_from_shortl(vusa); +} diff --git a/test/CodeGen/builtins-ppc-quadword.c b/test/CodeGen/builtins-ppc-quadword.c index f381642c422f..3e168c8b1be6 100644 --- a/test/CodeGen/builtins-ppc-quadword.c +++ b/test/CodeGen/builtins-ppc-quadword.c @@ -15,6 +15,12 @@ vector signed __int128 vlll = { -1 }; // CHECK-PPC: error: __int128 is not supported on this target vector unsigned __int128 vulll = { 1 }; +signed long long param_sll; +// CHECK-PPC: error: __int128 is not supported on this target +signed __int128 param_lll; +// CHECK-PPC: error: __int128 is not supported on this target +unsigned __int128 param_ulll; + // CHECK-PPC: error: __int128 is not supported on this target vector signed __int128 res_vlll; // CHECK-PPC: error: __int128 is not supported on this target @@ -119,11 +125,32 @@ void test1() { // CHECK-LE: @llvm.ppc.altivec.vsubeuqm // CHECK-PPC: error: assigning to '__vector __int128' (vector of 1 '__int128' value) from incompatible type 'int' + /* vec_sube */ + res_vlll = vec_sube(vlll, vlll, vlll); +// CHECK: @llvm.ppc.altivec.vsubeuqm +// CHECK-LE: @llvm.ppc.altivec.vsubeuqm +// CHECK-PPC: error: call to 'vec_sube' is ambiguous + + res_vulll = vec_sube(vulll, vulll, vulll); +// CHECK: @llvm.ppc.altivec.vsubeuqm +// CHECK-LE: @llvm.ppc.altivec.vsubeuqm +// CHECK-PPC: error: call to 'vec_sube' is ambiguous + + res_vlll = vec_sube(vlll, vlll, vlll); +// CHECK: @llvm.ppc.altivec.vsubeuqm +// CHECK-LE: @llvm.ppc.altivec.vsubeuqm +// CHECK-PPC: error: call to 'vec_sube' is ambiguous + res_vulll = vec_vsubeuqm(vulll, vulll, vulll); // CHECK: @llvm.ppc.altivec.vsubeuqm // CHECK-LE: @llvm.ppc.altivec.vsubeuqm // CHECK-PPC: error: assigning to '__vector unsigned __int128' (vector of 1 'unsigned __int128' value) from incompatible type 'int' + res_vulll = vec_sube(vulll, vulll, vulll); +// CHECK: @llvm.ppc.altivec.vsubeuqm +// CHECK-LE: @llvm.ppc.altivec.vsubeuqm +// CHECK-PPC: error: call to 'vec_sube' is ambiguous + /* vec_subc */ res_vlll = vec_subc(vlll, vlll); // CHECK: @llvm.ppc.altivec.vsubcuq @@ -150,11 +177,73 @@ void test1() { res_vlll = vec_vsubecuq(vlll, vlll, vlll); // CHECK: @llvm.ppc.altivec.vsubecuq // CHECK-LE: @llvm.ppc.altivec.vsubecuq -// CHECK-PPC: error: assigning to '__vector __int128' (vector of 1 '__int128' value) from 
incompatible type 'int' +// CHECK-PPC: error: assigning to '__vector __int128' (vector of 1 '__int128' value) from incompatible type 'int' res_vulll = vec_vsubecuq(vulll, vulll, vulll); // CHECK: @llvm.ppc.altivec.vsubecuq // CHECK-LE: @llvm.ppc.altivec.vsubecuq +// CHECK-PPC: error: assigning to '__vector unsigned __int128' (vector of 1 'unsigned __int128' value) from incompatible type 'int' + + res_vlll = vec_subec(vlll, vlll, vlll); +// CHECK: @llvm.ppc.altivec.vsubecuq +// CHECK-LE: @llvm.ppc.altivec.vsubecuq +// CHECK-PPC: error: assigning to '__vector __int128' (vector of 1 '__int128' value) from incompatible type 'int' + + res_vulll = vec_subec(vulll, vulll, vulll); +// CHECK: @llvm.ppc.altivec.vsubecuq +// CHECK-LE: @llvm.ppc.altivec.vsubecuq // CHECK-PPC: error: assigning to '__vector unsigned __int128' (vector of 1 'unsigned __int128' value) from incompatible type 'int' + res_vulll = vec_revb(vulll); +// CHECK: store <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8>* {{%.+}}, align 16 +// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-LE: store <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: xor <16 x i8> +// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-PPC: error: call to 'vec_revb' is ambiguous + + /* vec_xl */ + res_vlll = vec_xl(param_sll, &param_lll); + // CHECK: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xl' is ambiguous + + res_vulll = vec_xl(param_sll, &param_ulll); + // CHECK: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xl' is ambiguous + + /* vec_xst */ + vec_xst(vlll, param_sll, &param_lll); + // CHECK: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xst' is ambiguous + + vec_xst(vulll, param_sll, &param_ulll); + // CHECK: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xst' is ambiguous + + /* vec_xl_be */ + res_vlll = vec_xl_be(param_sll, &param_lll); + // CHECK: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xl' is ambiguous + + res_vulll = vec_xl_be(param_sll, &param_ulll); + // CHECK: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: load <1 x i128>, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xl' is ambiguous + + /* vec_xst_be */ + vec_xst_be(vlll, param_sll, &param_lll); + // CHECK: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-LE: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xst' is ambiguous + + vec_xst_be(vulll, param_sll, &param_ulll); + // CHECK: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align
16 + // CHECK-LE: store <1 x i128> %{{[0-9]+}}, <1 x i128>* %{{[0-9]+}}, align 16 + // CHECK-PPC: error: call to 'vec_xst' is ambiguous } diff --git a/test/CodeGen/builtins-ppc-vsx.c b/test/CodeGen/builtins-ppc-vsx.c index e58afdd94dab..16c72c404d9c 100644 --- a/test/CodeGen/builtins-ppc-vsx.c +++ b/test/CodeGen/builtins-ppc-vsx.c @@ -21,6 +21,7 @@ vector bool long long vbll = { 1, 0 }; vector signed long long vsll = { 255LL, -937LL }; vector unsigned long long vull = { 1447LL, 2894LL }; double d = 23.4; +signed long long sll = 618LL; float af[4] = {23.4f, 56.7f, 89.0f, 12.3f}; double ad[2] = {23.4, 56.7}; signed char asc[16] = { -8, 9, -10, 11, -12, 13, -14, 15, @@ -31,8 +32,8 @@ signed short ass[8] = { -1, 2, -3, 4, -5, 6, -7, 8 }; unsigned short aus[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; signed int asi[4] = { -1, 2, -3, 4 }; unsigned int aui[4] = { 0, 1, 2, 3 }; -signed long asl[2] = { -1L, 2L }; -unsigned long aul[2] = { 1L, 2L }; +signed long long asll[2] = { -1L, 2L }; +unsigned long long aull[2] = { 1L, 2L }; vector float res_vf; vector double res_vd; @@ -69,6 +70,18 @@ void test1() { // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) + res_vd = vec_abs(vd); +// CHECK: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{[0-9]*}}) +// CHECK-LE: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{[0-9]*}}) + + res_vf = vec_nabs(vf); +// CHECK: [[VEC:%[0-9]+]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) +// CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VEC]] + + res_vd = vec_nabs(vd); +// CHECK: [[VECD:%[0-9]+]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{[0-9]*}}) +// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[VECD]] + dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() @@ -1080,4 +1093,602 @@ void test1() { // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> + + res_vsll = vec_signed(vd); +// CHECK: fptosi <2 x double> +// CHECK-LE: fptosi <2 x double> + + res_vsi = vec_signed2(vd, vd); +// CHECK: extractelement <2 x double> +// CHECK: fptosi double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptosi double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptosi double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptosi double +// CHECK: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptosi double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptosi double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptosi double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptosi double +// CHECK-LE: insertelement <4 x i32> + + res_vsi = vec_signede(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpsxws(<2 x double> +// CHECK-LE: @llvm.ppc.vsx.xvcvdpsxws(<2 x double> +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vsi = vec_signedo(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpsxws(<2 x double> +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: 
add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvdpsxws(<2 x double> + + res_vull = vec_unsigned(vd); +// CHECK: fptoui <2 x double> +// CHECK-LE: fptoui <2 x double> + + res_vui = vec_unsigned2(vd, vd); +// CHECK: extractelement <2 x double> +// CHECK: fptoui double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptoui double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptoui double +// CHECK: insertelement <4 x i32> +// CHECK: extractelement <2 x double> +// CHECK: fptoui double +// CHECK: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptoui double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptoui double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptoui double +// CHECK-LE: insertelement <4 x i32> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptoui double +// CHECK-LE: insertelement <4 x i32> + + res_vui = vec_unsignede(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpuxws(<2 x double> +// CHECK-LE: @llvm.ppc.vsx.xvcvdpuxws(<2 x double> +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vui = vec_unsignedo(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpuxws(<2 x double> +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvdpuxws(<2 x double> + + res_vf = vec_float2(vsll, vsll); +// CHECK: extractelement <2 x i64> +// CHECK: sitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: sitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: sitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: sitofp i64 +// CHECK: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: sitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: sitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: sitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: sitofp i64 +// CHECK-LE: insertelement <4 x float> + + res_vf = vec_float2(vull, vull); +// CHECK: extractelement <2 x i64> +// CHECK: uitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: uitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: uitofp i64 +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x i64> +// CHECK: uitofp i64 +// CHECK: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: uitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: uitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: uitofp i64 +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x i64> +// CHECK-LE: uitofp i64 +// CHECK-LE: insertelement <4 x float> + + res_vf = vec_float2(vd, vd); +// CHECK: extractelement <2 x double> +// CHECK: fptrunc double +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x 
double> +// CHECK: fptrunc double +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x double> +// CHECK: fptrunc double +// CHECK: insertelement <4 x float> +// CHECK: extractelement <2 x double> +// CHECK: fptrunc double +// CHECK: insertelement <4 x float> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptrunc double +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptrunc double +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptrunc double +// CHECK-LE: insertelement <4 x float> +// CHECK-LE: extractelement <2 x double> +// CHECK-LE: fptrunc double +// CHECK-LE: insertelement <4 x float> + + res_vf = vec_floate(vsll); +// CHECK: @llvm.ppc.vsx.xvcvsxdsp +// CHECK-LE: @llvm.ppc.vsx.xvcvsxdsp +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vf = vec_floate(vull); +// CHECK: @llvm.ppc.vsx.xvcvuxdsp +// CHECK-LE: @llvm.ppc.vsx.xvcvuxdsp +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vf = vec_floate(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpsp +// CHECK-LE: @llvm.ppc.vsx.xvcvdpsp +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vf = vec_floato(vsll); +// CHECK: @llvm.ppc.vsx.xvcvsxdsp +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvsxdsp + + res_vf = vec_floato(vull); +// CHECK: @llvm.ppc.vsx.xvcvuxdsp +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvuxdsp + + res_vf = vec_floato(vd); +// CHECK: @llvm.ppc.vsx.xvcvdpsp +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvdpsp + + res_vd = vec_double(vsll); +// CHECK: sitofp <2 x i64> +// CHECK-LE: sitofp <2 x i64> + + res_vd = vec_double(vull); +// CHECK: uitofp <2 x i64> +// CHECK-LE: uitofp <2 x i64> + + res_vd = vec_doublee(vsi); +// CHECK: @llvm.ppc.vsx.xvcvsxwdp(<4 x i32 +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvsxwdp(<4 x i32 + + res_vd = vec_doublee(vui); +// CHECK: @llvm.ppc.vsx.xvcvuxwdp(<4 x i32 +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvuxwdp(<4 x i32 + + res_vd = vec_doublee(vf); +// CHECK: @llvm.ppc.vsx.xvcvspdp(<4 x float +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm +// CHECK-LE: @llvm.ppc.vsx.xvcvspdp(<4 x float + + res_vd = vec_doubleh(vsi); +// CHECK: extractelement <4 x i32> +// CHECK: sitofp i32 +// CHECK: 
insertelement <2 x double> +// CHECK: extractelement <4 x i32> +// CHECK: sitofp i32 +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: sitofp i32 +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: sitofp i32 +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doubleh(vui); +// CHECK: extractelement <4 x i32> +// CHECK: uitofp i32 +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x i32> +// CHECK: uitofp i32 +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: uitofp i32 +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: uitofp i32 +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doubleh(vf); +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x float> +// CHECK-LE: fpext float +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x float> +// CHECK-LE: fpext float +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doublel(vsi); +// CHECK: extractelement <4 x i32> +// CHECK: sitofp i32 +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x i32> +// CHECK: sitofp i32 +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: sitofp i32 +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: sitofp i32 +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doublel(vui); +// CHECK: extractelement <4 x i32> +// CHECK: uitofp i32 +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x i32> +// CHECK: uitofp i32 +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: uitofp i32 +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x i32> +// CHECK-LE: uitofp i32 +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doublel(vf); +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK: extractelement <4 x float> +// CHECK: fpext float +// CHECK: insertelement <2 x double> +// CHECK-LE: extractelement <4 x float> +// CHECK-LE: fpext float +// CHECK-LE: insertelement <2 x double> +// CHECK-LE: extractelement <4 x float> +// CHECK-LE: fpext float +// CHECK-LE: insertelement <2 x double> + + res_vd = vec_doubleo(vsi); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK: @llvm.ppc.vsx.xvcvsxwdp(<4 x i32> +// CHECK-LE: @llvm.ppc.vsx.xvcvsxwdp(<4 x i32> + + res_vd = vec_doubleo(vui); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK: @llvm.ppc.vsx.xvcvuxwdp(<4 x i32> +// CHECK-LE: @llvm.ppc.vsx.xvcvuxwdp(<4 x i32> + + res_vd = vec_doubleo(vf); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK: @llvm.ppc.vsx.xvcvspdp(<4 x float> +// CHECK-LE: @llvm.ppc.vsx.xvcvspdp(<4 x float> + + res_vbll 
= vec_reve(vbll); +// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> +// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> + + res_vsll = vec_reve(vsll); +// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> +// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> + + res_vull = vec_reve(vull); +// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> +// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> + + res_vd = vec_reve(vd); +// CHECK: shufflevector <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> +// CHECK-LE: shufflevector <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x i32> <i32 1, i32 0> + + res_vbll = vec_revb(vbll); +// CHECK: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-LE: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: xor <16 x i8> +// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) + + res_vsll = vec_revb(vsll); +// CHECK: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-LE: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: xor <16 x i8> +// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) + + res_vull = vec_revb(vull); +// CHECK: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-LE: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: xor <16 x i8> +// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) + + res_vd = vec_revb(vd); +// CHECK: store <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) +// CHECK-LE: store <16 x i8> <i8 7, i8 6, i8 
5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* {{%.+}}, align 16 +// CHECK-LE: xor <16 x i8> +// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> {{%.+}}, <4 x i32> {{%.+}}, <16 x i8> {{%.+}}) + + res_vbll = vec_sld(vbll, vbll, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vsll = vec_sld(vsll, vsll, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vull = vec_sld(vull, vull, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vd = vec_sld(vd, vd, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vsll = vec_sldw(vsll, vsll, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vull = vec_sldw(vull, vull, 0); +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 1 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 2 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 3 +// CHECK: add nsw i32 {{[0-9a-zA-Z%.]+}}, 15 +// CHECK: @llvm.ppc.altivec.vperm +// CHECK-LE: sub nsw i32 16 +// CHECK-LE: sub nsw i32 17 +// CHECK-LE: sub nsw i32 18 +// CHECK-LE: sub nsw i32 31 +// CHECK-LE: @llvm.ppc.altivec.vperm + + res_vsll = vec_sll(vsll, vuc); +// CHECK: @llvm.ppc.altivec.vsl +// CHECK-LE: @llvm.ppc.altivec.vsl + +res_vull = vec_sll(vull, vuc); +// CHECK: @llvm.ppc.altivec.vsl +// CHECK-LE: @llvm.ppc.altivec.vsl + +res_vsll = vec_slo(vsll, vsc); +// CHECK: @llvm.ppc.altivec.vslo +// CHECK-LE: @llvm.ppc.altivec.vslo + + res_vsll = vec_slo(vsll, vuc); +// CHECK: @llvm.ppc.altivec.vslo +// CHECK-LE: @llvm.ppc.altivec.vslo + + res_vull = vec_slo(vull, vsc); +// CHECK: @llvm.ppc.altivec.vslo +// CHECK-LE: @llvm.ppc.altivec.vslo + + res_vull = vec_slo(vull, vuc); +// CHECK: @llvm.ppc.altivec.vslo +// CHECK-LE: @llvm.ppc.altivec.vslo + + res_vsll = vec_srl(vsll, vuc); +// CHECK: 
@llvm.ppc.altivec.vsr +// CHECK-LE: @llvm.ppc.altivec.vsr + + res_vull = vec_srl(vull, vuc); +// CHECK: @llvm.ppc.altivec.vsr +// CHECK-LE: @llvm.ppc.altivec.vsr + + res_vsll = vec_sro(vsll, vsc); +// CHECK: @llvm.ppc.altivec.vsro +// CHECK-LE: @llvm.ppc.altivec.vsro + + res_vsll = vec_sro(vsll, vuc); +// CHECK: @llvm.ppc.altivec.vsro +// CHECK-LE: @llvm.ppc.altivec.vsro + + res_vull = vec_sro(vull, vsc); +// CHECK: @llvm.ppc.altivec.vsro +// CHECK-LE: @llvm.ppc.altivec.vsro + + res_vull = vec_sro(vull, vuc); +// CHECK: @llvm.ppc.altivec.vsro +// CHECK-LE: @llvm.ppc.altivec.vsro + +res_vsll = vec_xl(sll, asll); +// CHECK: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 + +res_vull = vec_xl(sll, aull); +// CHECK: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 + +res_vd = vec_xl(sll, ad); +// CHECK: load <2 x double>, <2 x double>* %{{[0-9]+}}, align 16 +// CHECK-LE: load <2 x double>, <2 x double>* %{{[0-9]+}}, align 16 + +vec_xst(vsll, sll, asll); +// CHECK: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 + +vec_xst(vull, sll, aull); +// CHECK: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 + +vec_xst(vd, sll, ad); +// CHECK: store <2 x double> %{{[0-9]+}}, <2 x double>* %{{[0-9]+}}, align 16 +// CHECK-LE: store <2 x double> %{{[0-9]+}}, <2 x double>* %{{[0-9]+}}, align 16 + +res_vsll = vec_xl_be(sll, asll); +// CHECK: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}}) + +res_vull = vec_xl_be(sll, aull); +// CHECK: load <2 x i64>, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}}) + +res_vd = vec_xl_be(sll, ad); +// CHECK: load <2 x double>, <2 x double>* %{{[0-9]+}}, align 16 +// CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}}) + +vec_xst_be(vsll, sll, asll); +// CHECK: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}}) + +vec_xst_be(vull, sll, aull); +// CHECK: store <2 x i64> %{{[0-9]+}}, <2 x i64>* %{{[0-9]+}}, align 16 +// CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}}) + +vec_xst_be(vd, sll, ad); +// CHECK: store <2 x double> %{{[0-9]+}}, <2 x double>* %{{[0-9]+}}, align 16 +// CHECK-LE: call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %{{[0-9]+}}, i8* %{{[0-9]+}}) + + res_vf = vec_neg(vf); +// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, {{%[0-9]+}} +// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, {{%[0-9]+}} + + res_vd = vec_neg(vd); +// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, {{%[0-9]+}} +// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, {{%[0-9]+}} } diff --git a/test/CodeGen/builtins-systemz-zvector-error.c b/test/CodeGen/builtins-systemz-zvector-error.c index 8d5380dac161..b28e8d9bc1fc 100644 --- a/test/CodeGen/builtins-systemz-zvector-error.c +++ b/test/CodeGen/builtins-systemz-zvector-error.c @@ -233,38 +233,27 @@ void test_core(void) { // expected-note@vecintrin.h:* 1 {{must be a constant integer from 
0 to 1}} vsc = vec_load_bndry(cptrsc, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 200); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 32); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 8192); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vuc = vec_load_bndry(cptruc, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vss = vec_load_bndry(cptrss, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vus = vec_load_bndry(cptrus, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vsi = vec_load_bndry(cptrsi, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vui = vec_load_bndry(cptrui, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vsl = vec_load_bndry(cptrsl, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vul = vec_load_bndry(cptrul, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} vuc = vec_genmask(idx); // expected-error {{no matching 
function}} // expected-note@vecintrin.h:* {{must be a constant integer}} diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c index 8fa24e668f74..ec8a8bf868c5 100644 --- a/test/CodeGen/builtins-x86.c +++ b/test/CodeGen/builtins-x86.c @@ -262,7 +262,9 @@ void f0() { tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); (void) __builtin_ia32_ldmxcsr(tmp_Ui); + (void) _mm_setcsr(tmp_Ui); tmp_Ui = __builtin_ia32_stmxcsr(); + tmp_Ui = _mm_getcsr(); (void)__builtin_ia32_fxsave(tmp_vp); (void)__builtin_ia32_fxsave64(tmp_vp); (void)__builtin_ia32_fxrstor(tmp_vp); @@ -290,6 +292,7 @@ void f0() { tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); + tmp_i = __rdtsc(); tmp_i = __builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); #ifdef USE_64 @@ -304,6 +307,7 @@ void f0() { tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); + (void) _mm_sfence(); tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); @@ -339,8 +343,13 @@ void f0() { tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) __builtin_ia32_clflush(tmp_vCp); + (void) _mm_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); + (void) _mm_lfence(); (void) __builtin_ia32_mfence(); + (void) _mm_mfence(); + (void) __builtin_ia32_pause(); + (void) _mm_pause(); tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); diff --git a/test/CodeGen/builtins.c b/test/CodeGen/builtins.c index 405f2199af19..390c2e35bf9d 100644 --- a/test/CodeGen/builtins.c +++ b/test/CodeGen/builtins.c @@ -220,6 +220,10 @@ void test_float_builtins(float F, double D, long double LD) { // CHECK: call float @llvm.fabs.f32(float // CHECK: fcmp one float {{.*}}, 0x7FF0000000000000 + res = finite(D); + // CHECK: call double @llvm.fabs.f64(double + // CHECK: fcmp one double {{.*}}, 0x7FF0000000000000 + res = __builtin_isnormal(F); // CHECK: fcmp oeq float // CHECK: call float @llvm.fabs.f32(float @@ -353,6 +357,7 @@ void test_float_builtin_ops(float F, double D, long double LD) { // __builtin_longjmp isn't supported on all platforms, so only test it on X86. 
#ifdef __x86_64__ + // CHECK-LABEL: define void @test_builtin_longjmp void test_builtin_longjmp(void **buffer) { // CHECK: [[BITCAST:%.*]] = bitcast @@ -360,6 +365,7 @@ void test_builtin_longjmp(void **buffer) { __builtin_longjmp(buffer, 1); // CHECK-NEXT: unreachable } + #endif // CHECK-LABEL: define i64 @test_builtin_readcyclecounter @@ -367,3 +373,234 @@ long long test_builtin_readcyclecounter() { // CHECK: call i64 @llvm.readcyclecounter() return __builtin_readcyclecounter(); } + +// Behavior of __builtin_os_log differs between platforms, so only test on X86 +#ifdef __x86_64__ + +// CHECK-LABEL: define void @test_builtin_os_log +// CHECK: (i8* [[BUF:%.*]], i32 [[I:%.*]], i8* [[DATA:%.*]]) +void test_builtin_os_log(void *buf, int i, const char *data) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i32 [[I]], i32* [[I_ADDR:%.*]], align 4 + // CHECK: store i8* [[DATA]], i8** [[DATA_ADDR:%.*]], align 8 + + // CHECK: store volatile i32 34 + len = __builtin_os_log_format_buffer_size("%d %{public}s %{private}.16P", i, data, data); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 3, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 4, i8* [[NUM_ARGS]] + // + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 0, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 4, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_INT:%.*]] = bitcast i8* [[ARG1]] to i32* + // CHECK: [[I2:%.*]] = load i32, i32* [[I_ADDR]] + // CHECK: store i32 [[I2]], i32* [[ARG1_INT]] + + // CHECK: [[ARG2_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 8 + // CHECK: store i8 34, i8* [[ARG2_DESC]] + // CHECK: [[ARG2_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 9 + // CHECK: store i8 8, i8* [[ARG2_SIZE]] + // CHECK: [[ARG2:%.*]] = getelementptr i8, i8* [[BUF2]], i64 10 + // CHECK: [[ARG2_PTR:%.*]] = bitcast i8* [[ARG2]] to i8** + // CHECK: [[DATA2:%.*]] = load i8*, i8** [[DATA_ADDR]] + // CHECK: store i8* [[DATA2]], i8** [[ARG2_PTR]] + + // CHECK: [[ARG3_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 18 + // CHECK: store i8 17, i8* [[ARG3_DESC]] + // CHECK: [[ARG3_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 19 + // CHECK: store i8 4, i8* [[ARG3_SIZE]] + // CHECK: [[ARG3:%.*]] = getelementptr i8, i8* [[BUF2]], i64 20 + // CHECK: [[ARG3_INT:%.*]] = bitcast i8* [[ARG3]] to i32* + // CHECK: store i32 16, i32* [[ARG3_INT]] + + // CHECK: [[ARG4_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 24 + // CHECK: store i8 49, i8* [[ARG4_DESC]] + // CHECK: [[ARG4_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 25 + // CHECK: store i8 8, i8* [[ARG4_SIZE]] + // CHECK: [[ARG4:%.*]] = getelementptr i8, i8* [[BUF2]], i64 26 + // CHECK: [[ARG4_PTR:%.*]] = bitcast i8* [[ARG4]] to i8** + // CHECK: [[DATA3:%.*]] = load i8*, i8** [[DATA_ADDR]] + // CHECK: store i8* [[DATA3]], i8** [[ARG4_PTR]] + + __builtin_os_log_format(buf, "%d %{public}s %{private}.16P", i, data, data); +} + +// CHECK-LABEL: define void @test_builtin_os_log_errno +// CHECK: (i8* [[BUF:%.*]], i8* [[DATA:%.*]]) +void test_builtin_os_log_errno(void *buf, const char *data) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i8* [[DATA]], i8** [[DATA_ADDR:%.*]], align 8 + + // 
CHECK: store volatile i32 2 + len = __builtin_os_log_format_buffer_size("%S"); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 2, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 1, i8* [[NUM_ARGS]] + + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 96, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 0, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_INT:%.*]] = bitcast i8* [[ARG1]] to i32* + // CHECK: store i32 0, i32* [[ARG1_INT]] + + __builtin_os_log_format(buf, "%m"); +} + +// CHECK-LABEL: define void @test_builtin_os_log_wide +// CHECK: (i8* [[BUF:%.*]], i8* [[DATA:%.*]], i32* [[STR:%.*]]) +typedef int wchar_t; +void test_builtin_os_log_wide(void *buf, const char *data, wchar_t *str) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i8* [[DATA]], i8** [[DATA_ADDR:%.*]], align 8 + // CHECK: store i32* [[STR]], i32** [[STR_ADDR:%.*]], + + // CHECK: store volatile i32 12 + len = __builtin_os_log_format_buffer_size("%S", str); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 2, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 1, i8* [[NUM_ARGS]] + + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 80, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 8, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_PTR:%.*]] = bitcast i8* [[ARG1]] to i32** + // CHECK: [[STR2:%.*]] = load i32*, i32** [[STR_ADDR]] + // CHECK: store i32* [[STR2]], i32** [[ARG1_PTR]] + + __builtin_os_log_format(buf, "%S", str); +} + +// CHECK-LABEL: define void @test_builtin_os_log_precision_width +// CHECK: (i8* [[BUF:%.*]], i8* [[DATA:%.*]], i32 [[PRECISION:%.*]], i32 [[WIDTH:%.*]]) +void test_builtin_os_log_precision_width(void *buf, const char *data, + int precision, int width) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i8* [[DATA]], i8** [[DATA_ADDR:%.*]], align 8 + // CHECK: store i32 [[PRECISION]], i32* [[PRECISION_ADDR:%.*]], align 4 + // CHECK: store i32 [[WIDTH]], i32* [[WIDTH_ADDR:%.*]], align 4 + + // CHECK: store volatile i32 24, + len = __builtin_os_log_format_buffer_size("Hello %*.*s World", precision, width, data); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 2, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 3, i8* [[NUM_ARGS]] + + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 0, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 4, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_INT:%.*]] = bitcast i8* [[ARG1]] to i32* + // CHECK: [[ARG1_VAL:%.*]] = load i32, i32* [[PRECISION_ADDR]] + // CHECK: store i32 [[ARG1_VAL]], i32* [[ARG1_INT]] + + // CHECK: [[ARG2_DESC:%.*]] = getelementptr 
i8, i8* [[BUF2]], i64 8 + // CHECK: store i8 16, i8* [[ARG2_DESC]] + // CHECK: [[ARG2_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 9 + // CHECK: store i8 4, i8* [[ARG2_SIZE]] + // CHECK: [[ARG2:%.*]] = getelementptr i8, i8* [[BUF2]], i64 10 + // CHECK: [[ARG2_INT:%.*]] = bitcast i8* [[ARG2]] to i32* + // CHECK: [[ARG2_VAL:%.*]] = load i32, i32* [[WIDTH_ADDR]] + // CHECK: store i32 [[ARG2_VAL]], i32* [[ARG2_INT]] + + // CHECK: [[ARG3_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 14 + // CHECK: store i8 32, i8* [[ARG3_DESC]] + // CHECK: [[ARG3_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 15 + // CHECK: store i8 8, i8* [[ARG3_SIZE]] + // CHECK: [[ARG3:%.*]] = getelementptr i8, i8* [[BUF2]], i64 16 + // CHECK: [[ARG3_PTR:%.*]] = bitcast i8* [[ARG3]] to i8** + // CHECK: [[DATA2:%.*]] = load i8*, i8** [[DATA_ADDR]] + // CHECK: store i8* [[DATA2]], i8** [[ARG3_PTR]] + + __builtin_os_log_format(buf, "Hello %*.*s World", precision, width, data); +} + +// CHECK-LABEL: define void @test_builtin_os_log_invalid +// CHECK: (i8* [[BUF:%.*]], i32 [[DATA:%.*]]) +void test_builtin_os_log_invalid(void *buf, int data) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i32 [[DATA]], i32* [[DATA_ADDR:%.*]] + + // CHECK: store volatile i32 8, + len = __builtin_os_log_format_buffer_size("invalid specifier %: %d even a trailing one%", data); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 0, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 1, i8* [[NUM_ARGS]] + + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 0, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 4, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_INT:%.*]] = bitcast i8* [[ARG1]] to i32* + // CHECK: [[ARG1_VAL:%.*]] = load i32, i32* [[DATA_ADDR]] + // CHECK: store i32 [[ARG1_VAL]], i32* [[ARG1_INT]] + + __builtin_os_log_format(buf, "invalid specifier %: %d even a trailing one%", data); +} + +// CHECK-LABEL: define void @test_builtin_os_log_percent +// CHECK: (i8* [[BUF:%.*]], i8* [[DATA1:%.*]], i8* [[DATA2:%.*]]) +// Check that the %% which does not consume any argument is correctly handled +void test_builtin_os_log_percent(void *buf, const char *data1, const char *data2) { + volatile int len; + // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8 + // CHECK: store i8* [[DATA1]], i8** [[DATA1_ADDR:%.*]], align 8 + // CHECK: store i8* [[DATA2]], i8** [[DATA2_ADDR:%.*]], align 8 + // CHECK: store volatile i32 22 + len = __builtin_os_log_format_buffer_size("%s %% %s", data1, data2); + + // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]] + // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0 + // CHECK: store i8 2, i8* [[SUMMARY]] + // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1 + // CHECK: store i8 2, i8* [[NUM_ARGS]] + // + // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2 + // CHECK: store i8 32, i8* [[ARG1_DESC]] + // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3 + // CHECK: store i8 8, i8* [[ARG1_SIZE]] + // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4 + // CHECK: [[ARG1_PTR:%.*]] = bitcast i8* [[ARG1]] to i8** + // CHECK: [[DATA1:%.*]] = load i8*, i8** [[DATA1_ADDR]] + // CHECK: store i8* 
[[DATA1]], i8** [[ARG1_PTR]] + // + // CHECK: [[ARG2_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 12 + // CHECK: store i8 32, i8* [[ARG2_DESC]] + // CHECK: [[ARG2_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 13 + // CHECK: store i8 8, i8* [[ARG2_SIZE]] + // CHECK: [[ARG2:%.*]] = getelementptr i8, i8* [[BUF2]], i64 14 + // CHECK: [[ARG2_PTR:%.*]] = bitcast i8* [[ARG2]] to i8** + // CHECK: [[DATA2:%.*]] = load i8*, i8** [[DATA2_ADDR]] + // CHECK: store i8* [[DATA2]], i8** [[ARG2_PTR]] + __builtin_os_log_format(buf, "%s %% %s", data1, data2); +} + +#endif
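Editor's note (not part of the diff): the os_log tests above all pin down the same serialized buffer layout emitted by __builtin_os_log_format: one summary byte, one argument-count byte, then per argument a descriptor byte, a size byte, and the payload itself. A minimal C sketch of a reader for that layout, assuming exactly the encoding the CHECK lines verify; the function name is invented for illustration:

#include <stdint.h>

/* p points at one argument record: descriptor byte, size byte, payload.
   Returns the start of the next record. */
static const uint8_t *next_os_log_arg(const uint8_t *p,
                                      uint8_t *desc, uint8_t *size) {
  *desc = p[0];        /* e.g. 0 = plain scalar, 32 = string pointer,
                          per the values stored in the CHECK lines above */
  *size = p[1];        /* 4 for an i32 payload, 8 for a 64-bit pointer */
  return p + 2 + p[1]; /* skip descriptor, size, and payload bytes */
}

Walking records this way reproduces the offsets the tests check: the first record starts at byte 2, and a 4-byte scalar there is followed by the next descriptor at byte 8, matching ARG2_DESC above.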
\ No newline at end of file diff --git a/test/CodeGen/captured-statements-nested.c b/test/CodeGen/captured-statements-nested.c index b81705b8ad89..c078f72d51b2 100644 --- a/test/CodeGen/captured-statements-nested.c +++ b/test/CodeGen/captured-statements-nested.c @@ -69,7 +69,7 @@ void test_nest_captured_stmt(int param, int size, int param_arr[size]) { // CHECK1-NEXT: getelementptr inbounds %struct.A, %struct.A* // CHECK1-NEXT: store i8 99 // - // CHECK1: [[SIZE_ADDR_REF:%.*]] = getelementptr inbounds [[T]], [[T]]* {{.*}}, i{{.+}} 0, i{{.+}} 7 + // CHECK1-DAG: [[SIZE_ADDR_REF:%.*]] = getelementptr inbounds [[T]], [[T]]* {{.*}}, i{{.+}} 0, i{{.+}} 7 // CHECK1-DAG: [[SIZE_ADDR:%.*]] = load i{{.+}}*, i{{.+}}** [[SIZE_ADDR_REF]] // CHECK1-DAG: [[SIZE:%.*]] = load i{{.+}}, i{{.+}}* [[SIZE_ADDR]] // CHECK1-DAG: [[PARAM_ARR_IDX:%.*]] = sub nsw i{{.+}} [[SIZE]], 1 @@ -79,7 +79,7 @@ void test_nest_captured_stmt(int param, int size, int param_arr[size]) { // CHECK1-DAG: [[PARAM_ARR_SIZE_MINUS_1_ADDR:%.*]] = getelementptr inbounds i{{.+}}, i{{.+}}* [[PARAM_ARR]], i{{.*}} // CHECK1: store i{{.+}} 2, i{{.+}}* [[PARAM_ARR_SIZE_MINUS_1_ADDR]] // - // CHECK1: [[Z_ADDR_REF:%.*]] = getelementptr inbounds [[T]], [[T]]* {{.*}}, i{{.+}} 0, i{{.+}} 2 + // CHECK1-DAG: [[Z_ADDR_REF:%.*]] = getelementptr inbounds [[T]], [[T]]* {{.*}}, i{{.+}} 0, i{{.+}} 2 // CHECK1-DAG: [[Z_ADDR:%.*]] = load %struct.A*, %struct.A** [[Z_ADDR_REF]] // CHECK1-DAG: [[Z_A_ADDR:%.*]] = getelementptr inbounds %struct.A, %struct.A* [[Z_ADDR]], i{{.+}} 0, i{{.+}} 0 // CHECK1-DAG: [[ARR_IDX_2:%.*]] = load i{{.+}}, i{{.+}}* [[Z_A_ADDR]] diff --git a/test/CodeGen/cfi-icall-cross-dso2.c b/test/CodeGen/cfi-icall-cross-dso2.c new file mode 100644 index 000000000000..acc72a7ee766 --- /dev/null +++ b/test/CodeGen/cfi-icall-cross-dso2.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 -fblocks \ +// RUN: -fsanitize=cfi-icall -fsanitize-cfi-cross-dso \ +// RUN: -emit-llvm -o - %s | FileCheck %s + +// CHECK: define void @f() {{.*}} !type !{{.*}} !type !{{.*}} +void f(void); +void (*pf)(void) = f; +void f(void) { } + +// Check that we do not crash on non-FunctionDecl definitions. 
+void (^g)(void) = ^{}; diff --git a/test/CodeGen/cleanup-destslot-simple.c b/test/CodeGen/cleanup-destslot-simple.c index 9b9f74eb2184..848a8dc847a9 100644 --- a/test/CodeGen/cleanup-destslot-simple.c +++ b/test/CodeGen/cleanup-destslot-simple.c @@ -13,8 +13,8 @@ int test() { return *p; // CHECK: [[X:%.*]] = alloca i32 // CHECK: [[P:%.*]] = alloca i32* -// LIFETIME: call void @llvm.lifetime.start(i64 4, i8* %{{.*}}){{( #[0-9]+)?}}, !dbg -// LIFETIME: call void @llvm.lifetime.start(i64 8, i8* %{{.*}}){{( #[0-9]+)?}}, !dbg +// LIFETIME: call void @llvm.lifetime.start(i64 4, i8* nonnull %{{.*}}){{( #[0-9]+)?}}, !dbg +// LIFETIME: call void @llvm.lifetime.start(i64 8, i8* nonnull %{{.*}}){{( #[0-9]+)?}}, !dbg // CHECK-NOT: store i32 %{{.*}}, i32* %cleanup.dest.slot // LIFETIME: call void @llvm.lifetime.end(i64 8, {{.*}}){{( #[0-9]+)?}}, !dbg // LIFETIME: call void @llvm.lifetime.end(i64 4, {{.*}}){{( #[0-9]+)?}}, !dbg diff --git a/test/CodeGen/code-coverage.c b/test/CodeGen/code-coverage.c index 1b87d649dd97..80307f49ef34 100644 --- a/test/CodeGen/code-coverage.c +++ b/test/CodeGen/code-coverage.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data %s -o - | FileCheck %s // RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-no-function-names-in-data %s -o - | FileCheck %s --check-prefix WITHOUTNAMES +// RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO // <rdar://problem/12843084> @@ -28,3 +29,6 @@ int test1(int a) { // CHECK: void @__llvm_gcov_init() unnamed_addr [[NRZ]] // CHECK: attributes [[NRZ]] = { {{.*}}noredzone{{.*}} } + +// GCOV_FILE_INFO: !llvm.gcov = !{![[GCOV:[0-9]+]]} +// GCOV_FILE_INFO: ![[GCOV]] = !{!"aaa.gcno", !"bbb.gcda", !{{[0-9]+}}} diff --git a/test/CodeGen/compound-literal.c b/test/CodeGen/compound-literal.c index 85138bfaf5e0..6507341ce2c1 100644 --- a/test/CodeGen/compound-literal.c +++ b/test/CodeGen/compound-literal.c @@ -1,5 +1,10 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - | FileCheck %s +// Capture the type and name so matching later is cleaner. +struct CompoundTy { int a; }; +// CHECK: @MyCLH = constant [[MY_CLH:[^,]+]] +const struct CompoundTy *const MyCLH = &(struct CompoundTy){3}; + int* a = &(int){1}; struct s {int a, b, c;} * b = &(struct s) {1, 2, 3}; _Complex double * x = &(_Complex double){1.0f}; @@ -66,3 +71,14 @@ struct G g(int x, int y, int z) { // CHECK-NEXT: [[T0:%.*]] = load i48, i48* [[COERCE_TEMP]] // CHECK-NEXT: ret i48 [[T0]] } + +// We had a bug where we'd emit a new GlobalVariable for each time we used a +// const pointer to a variable initialized by a compound literal. 
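Editor's note (not part of the diff): the compareMyCLH test that follows leans on a C rule worth spelling out: a compound literal at file scope has static storage duration, so its address is an address constant and every use of MyCLH must resolve to the same single object; the old bug minted a fresh GlobalVariable per use. A small sketch of the storage-duration rule, with invented names:

struct P { int x; };

/* File scope: the literal lives for the whole program, so taking its
   address in an initializer is valid and names exactly one object. */
const struct P *gp = &(struct P){1};

void take(const struct P *);
void f(void) {
  /* Block scope: this literal has automatic storage duration; its
     address must not be kept past the enclosing block. */
  take(&(struct P){2});
}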
+// CHECK-LABEL: define i32 @compareMyCLH() #0 +int compareMyCLH() { + // CHECK: store i8* bitcast ([[MY_CLH]] to i8*) + const void *a = MyCLH; + // CHECK: store i8* bitcast ([[MY_CLH]] to i8*) + const void *b = MyCLH; + return a == b; +} diff --git a/test/CodeGen/dbg-const-int128.c b/test/CodeGen/dbg-const-int128.c new file mode 100644 index 000000000000..966c22705500 --- /dev/null +++ b/test/CodeGen/dbg-const-int128.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -S -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s +// CHECK: !DIGlobalVariable({{.*}} +// CHECK-NOT: expr: + +static const __uint128_t ro = 18446744073709551615; + +void bar(__uint128_t); +void foo() { bar(ro); } diff --git a/test/CodeGen/debug-info-atomic.c b/test/CodeGen/debug-info-atomic.c new file mode 100644 index 000000000000..3de0d3585d29 --- /dev/null +++ b/test/CodeGen/debug-info-atomic.c @@ -0,0 +1,8 @@ +// RUN: %clang -g -c -std=c11 -S -emit-llvm -o - %s | FileCheck %s + +// CHECK: !DIGlobalVariable(name: "i" +// CHECK-SAME: type: ![[T:.*]], isLocal: false, isDefinition: true) +// CHECK: ![[T]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: ![[BT:.*]]) +// CHECK: ![[BT]] = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: ![[BTT:.*]]) +// CHECK: ![[BTT]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +_Atomic const int i; diff --git a/test/CodeGen/debug-info-file-checksum.c b/test/CodeGen/debug-info-file-checksum.c new file mode 100644 index 000000000000..2750800d41ea --- /dev/null +++ b/test/CodeGen/debug-info-file-checksum.c @@ -0,0 +1,5 @@ +// RUN: %clang -emit-llvm -S -g -gcodeview -x c %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s + +// Check that "checksum" is created correctly for the compiled file. + +// CHECK: !DIFile(filename:{{.*}}, directory:{{.*}}, checksumkind: CSK_MD5, checksum: "a3b7d27af071accdeccaa933fc603608") diff --git a/test/CodeGen/debug-info-global-constant.c b/test/CodeGen/debug-info-global-constant.c new file mode 100644 index 000000000000..4175f24675e5 --- /dev/null +++ b/test/CodeGen/debug-info-global-constant.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone \ +// RUN: -triple %itanium_abi_triple %s -o - | FileCheck %s + +// Debug info for a global constant whose address is taken should be emitted +// exactly once. 
+ +// CHECK: @i = internal constant i32 1, align 4, !dbg ![[I:[0-9]+]] +// CHECK: ![[I]] = !DIGlobalVariableExpression(var: ![[VAR:.*]], expr: ![[EXPR:[0-9]+]]) +// CHECK: ![[VAR]] = distinct !DIGlobalVariable(name: "i", +// CHECK: !DICompileUnit({{.*}}globals: ![[GLOBALS:[0-9]+]]) +// CHECK: ![[GLOBALS]] = !{![[I]]} +// CHECK: ![[EXPR]] = !DIExpression(DW_OP_constu, 1, DW_OP_stack_value) +static const int i = 1; + +void g(const int *, int); +void f() { + g(&i, i); +} diff --git a/test/CodeGen/debug-info-imported-entity.cpp b/test/CodeGen/debug-info-imported-entity.cpp index 105cc3dc5371..c7935eda48c5 100644 --- a/test/CodeGen/debug-info-imported-entity.cpp +++ b/test/CodeGen/debug-info-imported-entity.cpp @@ -3,9 +3,7 @@ namespace std { class A; } using std::A; using ::A; - -// CHECK: [[CompileUnit:![0-9]+]] = distinct !DICompileUnit({{.+}} imports: [[Imports:![0-9]+]]) +// CHECK: [[CompileUnit:![0-9]+]] = distinct !DICompileUnit({{.+}} imports: [[Imports:![0-9]+]] // CHECK: [[Imports]] = !{[[ImportedEntity:![0-9]+]]} // CHECK: [[ImportedEntity]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CompileUnit]], entity: [[STDA:![0-9]+]], line: 4) // CHECK: [[STDA]] = !DICompositeType(tag: DW_TAG_class_type, name: "A", - diff --git a/test/CodeGen/debug-info-packed-struct.c b/test/CodeGen/debug-info-packed-struct.c index 8c1a0d4cfdb3..6441a740e379 100644 --- a/test/CodeGen/debug-info-packed-struct.c +++ b/test/CodeGen/debug-info-packed-struct.c @@ -1,9 +1,9 @@ // RUN: %clang_cc1 -x c -debug-info-kind=limited -emit-llvm -triple x86_64-apple-darwin -o - %s | FileCheck %s +// CHECK: %struct.layout3 = type <{ i8, [3 x i8], %struct.size8_pack4, i8, [3 x i8] }> // CHECK: %struct.layout0 = type { i8, %struct.size8, i8 } // CHECK: %struct.layout1 = type <{ i8, %struct.size8_anon, i8, [2 x i8] }> // CHECK: %struct.layout2 = type <{ i8, %struct.size8_pack1, i8 }> -// CHECK: %struct.layout3 = type <{ i8, [3 x i8], %struct.size8_pack4, i8, [3 x i8] }> // --------------------------------------------------------------------- // Not packed. 
@@ -19,9 +19,9 @@ struct layout0 { }; // CHECK: l0_ofs0 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs8", -// CHECK-SAME: {{.*}}size: 64, align: 64, offset: 64) +// CHECK-SAME: {{.*}}size: 64, offset: 64) // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs16", -// CHECK-SAME: {{.*}}size: 1, align: 32, offset: 128, flags: DIFlagBitField, extraData: i64 128) +// CHECK-SAME: {{.*}}size: 1, offset: 128, flags: DIFlagBitField, extraData: i64 128) // --------------------------------------------------------------------- @@ -38,9 +38,9 @@ struct layout1 { }; // CHECK: l1_ofs0 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs1", -// CHECK-SAME: {{.*}}size: 64, align: 8, offset: 8) +// CHECK-SAME: {{.*}}size: 64, offset: 8) // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs9", -// CHECK-SAME: {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72) +// CHECK-SAME: {{.*}}size: 1, offset: 72, flags: DIFlagBitField, extraData: i64 72) // --------------------------------------------------------------------- @@ -59,9 +59,9 @@ struct layout2 { #pragma pack() // CHECK: l2_ofs0 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs1", -// CHECK-SAME: {{.*}}size: 64, align: 8, offset: 8) +// CHECK-SAME: {{.*}}size: 64, offset: 8) // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs9", -// CHECK-SAME: {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72) +// CHECK-SAME: {{.*}}size: 1, offset: 72, flags: DIFlagBitField, extraData: i64 72) @@ -81,11 +81,11 @@ struct layout3 { #pragma pack() // CHECK: l3_ofs0 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs4", -// CHECK-SAME: {{.*}}size: 64, align: 32, offset: 32) +// CHECK-SAME: {{.*}}size: 64, offset: 32) // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs12", -// CHECK-SAME: {{.*}}size: 1, align: 32, offset: 96, flags: DIFlagBitField, extraData: i64 96) +// CHECK-SAME: {{.*}}size: 1, offset: 96, flags: DIFlagBitField, extraData: i64 96) +struct layout3 l3; struct layout0 l0; struct layout1 l1; struct layout2 l2; -struct layout3 l3; diff --git a/test/CodeGen/debug-info-static-const-fp.c b/test/CodeGen/debug-info-static-const-fp.c new file mode 100644 index 000000000000..4dfe057f2b6e --- /dev/null +++ b/test/CodeGen/debug-info-static-const-fp.c @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 -emit-llvm -O0 -debug-info-kind=limited %s -o - | \ +// RUN: FileCheck --check-prefixes CHECK %s + +// RUN: %clang_cc1 -triple hexagon-unknown--elf -emit-llvm -O0 -debug-info-kind=limited %s -o - | \ +// RUN: FileCheck --check-prefixes CHECK,CHECK-LDsm %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -O0 -debug-info-kind=limited %s -o - | \ +// RUN: FileCheck --check-prefixes CHECK,CHECK-LDlg %s + +// Per PR26619, check that for referenced static const of floating-point type, +// we emit its constant value in debug info. +// +// NOTE that __fp16 is assumed to be 16 bits, float is assumed to be +// 32 bits, and double is assumed to be 64 bits. Size of long double +// is not known (for example, it is 64 bits for hexagon-unknown--elf, +// but 128 bits for x86_64-unknown-linux-gnu). Therefore, we specify +// target triples where it has a known size, and check accordingly: +// for the absence of a constant (CHECK-LDlg) when the size exceeds 64 +// bits, and for the presence of a constant (CHECK-LDsm) but not its +// value when the size does not exceed 64 bits. +// +// NOTE that PR26619 is not yet fixed for types greater than 64 bits. 
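Editor's note (not part of the diff): the DW_OP_constu payloads checked below are simply the IEEE-754 bit patterns of the initializers, reinterpreted as unsigned integers. A standalone C check for the fVal constant, assuming a 32-bit IEEE float:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  float f = -147 / 17.0f;               /* fVal from the test */
  uint32_t bits;
  memcpy(&bits, &f, sizeof bits);       /* well-defined way to read the bits */
  printf("%lu\n", (unsigned long)bits); /* prints 3238681178, as in FEXPR */
  return 0;
}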
+ +static const __fp16 hVal = 29/13.0f; // 2.2307692307692307692 (2.23046875) + +static const float fVal = -147/17.0f; // -8.6470588235294117647 (-8.64705849) + +static const double dVal = 19637/7.0; // 2805.2857142857142857 (2805.2857142857142) + +static const long double ldVal = 3/1234567.0L; // 2.4300017739012949479e-06 (<depends on size of long double>) + +int main() { + return hVal + fVal + dVal + ldVal; +} + +// CHECK: !DIGlobalVariableExpression(var: [[HVAL:.*]], expr: [[HEXPR:.*]]) +// CHECK: [[HVAL]] = distinct !DIGlobalVariable(name: "hVal", +// CHECK-SAME: isLocal: true, isDefinition: true +// CHECK: [[HEXPR]] = !DIExpression(DW_OP_constu, 16502, DW_OP_stack_value) + +// CHECK: !DIGlobalVariableExpression(var: [[FVAL:.*]], expr: [[FEXPR:.*]]) +// CHECK: [[FVAL]] = distinct !DIGlobalVariable(name: "fVal", +// CHECK-SAME: isLocal: true, isDefinition: true +// CHECK: [[FEXPR]] = !DIExpression(DW_OP_constu, 3238681178, DW_OP_stack_value) + +// CHECK: !DIGlobalVariableExpression(var: [[DVAL:.*]], expr: [[DEXPR:.*]]) +// CHECK: [[DVAL]] = distinct !DIGlobalVariable(name: "dVal", +// CHECK-SAME: isLocal: true, isDefinition: true +// CHECK: [[DEXPR]] = !DIExpression(DW_OP_constu, 4658387303597904457, DW_OP_stack_value) + +// CHECK-LDlg-DAG: [[LDVAL:.*]] = distinct !DIGlobalVariable(name: "ldVal", {{.*}}, isLocal: true, isDefinition: true) +// CHECK-LDlg-DAG: !DIGlobalVariableExpression(var: [[LDVAL]]) +// CHECK-LDsm-DAG: [[LDVAL:.*]] = distinct !DIGlobalVariable(name: "ldVal", {{.*}}, isLocal: true, isDefinition: true) +// CHECK-LDsm-DAG: !DIGlobalVariableExpression(var: [[LDVAL]], expr: diff --git a/test/CodeGen/debug-info-static.c b/test/CodeGen/debug-info-static.c index fbe2a0098f70..016f1e6e6cc5 100644 --- a/test/CodeGen/debug-info-static.c +++ b/test/CodeGen/debug-info-static.c @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s -// CHECK: !DIGlobalVariable({{.*}}variable: i32* @f.xyzzy +// CHECK: @f.xyzzy = internal global i32 0, align 4, !dbg [[XYZZY:![0-9]+]] + +// CHECK: [[XYZZY]] = !DIGlobalVariableExpression(var: [[VAR:.*]]) +// CHECK: [[VAR]] = distinct !DIGlobalVariable void f(void) { static int xyzzy; diff --git a/test/CodeGen/debug-info-vector.c b/test/CodeGen/debug-info-vector.c index 6b27573578c2..93e63926c93d 100644 --- a/test/CodeGen/debug-info-vector.c +++ b/test/CodeGen/debug-info-vector.c @@ -6,6 +6,6 @@ v4si a; // Test that we get an array type that's also a vector out of debug. // CHECK: !DICompositeType(tag: DW_TAG_array_type, // CHECK-SAME: baseType: ![[INT:[0-9]+]] -// CHECK-SAME: size: 128, align: 128 +// CHECK-SAME: size: 128 // CHECK-SAME: DIFlagVector // CHECK: ![[INT]] = !DIBasicType(name: "int" diff --git a/test/CodeGen/decl-in-prototype.c b/test/CodeGen/decl-in-prototype.c index 15efa6551bc3..426adf4b765c 100644 --- a/test/CodeGen/decl-in-prototype.c +++ b/test/CodeGen/decl-in-prototype.c @@ -1,4 +1,4 @@ -// RUN: %clang -target i386-unknown-unknown -emit-llvm -S -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple i386-linux -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s const int AA = 5; @@ -19,3 +19,8 @@ int f(void (*g)(), enum {AA,BB} h) { // CHECK: ret i32 0 return AA; } + +// This used to crash with debug info enabled. 
+int pr31366(struct { enum { a = 1 } b; } c) { + return a; +} diff --git a/test/CodeGen/denormalfpmode.c b/test/CodeGen/denormalfpmode.c new file mode 100644 index 000000000000..b0013daefbf8 --- /dev/null +++ b/test/CodeGen/denormalfpmode.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-IEEE +// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PS +// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PZ + +// CHECK-LABEL: main +// CHECK-IEEE: attributes #0 = {{.*}}"denormal-fp-math"="ieee"{{.*}} +// CHECK-PS: attributes #0 = {{.*}}"denormal-fp-math"="preserve-sign"{{.*}} +// CHECK-PZ: attributes #0 = {{.*}}"denormal-fp-math"="positive-zero"{{.*}} + +int main() { + return 0; +} diff --git a/test/CodeGen/dwarf-version.c b/test/CodeGen/dwarf-version.c index 2171ed6ad97d..cee0f031e130 --- a/test/CodeGen/dwarf-version.c +++ b/test/CodeGen/dwarf-version.c @@ -4,7 +4,13 @@ // RUN: %clang -target x86_64-linux-gnu -gdwarf-5 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER5 // RUN: %clang -target x86_64-linux-gnu -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER4 // RUN: %clang -target x86_64-linux-gnu -gdwarf -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER4 -// RUN: %clang -target x86_64-apple-darwin -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER2 + +// The -isysroot flag is used as a hack to avoid LIT messing with the SDKROOT +// environment variable, which indirectly overrides the version in the target +// triple used here. +// RUN: %clang -target x86_64-apple-macosx10.11 -g -S -emit-llvm -o - %s -isysroot %t | FileCheck %s --check-prefix=VER4 +// RUN: %clang -target x86_64-apple-darwin14 -g -S -emit-llvm -o - %s -isysroot %t | FileCheck %s --check-prefix=VER2 + // RUN: %clang -target powerpc-unknown-openbsd -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER2 // RUN: %clang -target powerpc-unknown-freebsd -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER2 // RUN: %clang -target i386-pc-solaris -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER2 diff --git a/test/CodeGen/ext-vector.c b/test/CodeGen/ext-vector.c index 0b78e97fbb36..beb58827ea13 --- a/test/CodeGen/ext-vector.c +++ b/test/CodeGen/ext-vector.c @@ -301,3 +301,40 @@ void test17(void) { char valC; char16 destVal = valC ?
valA : valB; } + +typedef __attribute__(( ext_vector_type(16) )) float float16; + +float16 vec16, vec16_2; + +// CHECK: @test_rgba +void test_rgba() { + // CHECK: fadd <4 x float> + vec4_2 = vec4.abgr + vec4; + + // CHECK: shufflevector {{.*}} <i32 0, i32 1> + vec2 = vec4.rg; + // CHECK: shufflevector {{.*}} <i32 2, i32 3> + vec2_2 = vec4.ba; + // CHECK: extractelement {{.*}} 2 + f = vec4.b; + // CHECK: shufflevector {{.*}} <i32 2, i32 2, i32 2, i32 2> + vec4_2 = vec4_2.bbbb; + + // CHECK: insertelement {{.*}} 0 + vec2.r = f; + // CHECK: shufflevector {{.*}} <i32 1, i32 0> + vec2.gr = vec2; + + // CHECK: extractelement {{.*}} 0 + f = vec4_2.rg.r; + // CHECK: shufflevector {{.*}} <i32 2, i32 1, i32 0> + // CHECK: shufflevector {{.*}} <i32 0, i32 1, i32 2, i32 undef> + // CHECK: shufflevector {{.*}} <i32 4, i32 5, i32 6, i32 3> + vec4.rgb = vec4.bgr; + + // CHECK: extractelement {{.*}} 11 + // CHECK: insertelement {{.*}} 2 + vec4.b = vec16.sb; + // CHECK: shufflevector {{.*}} <i32 10, i32 11, i32 12, i32 13> + vec4_2 = vec16.sabcd; +} diff --git a/test/CodeGen/f16c-builtins.c b/test/CodeGen/f16c-builtins.c index e99c0d00d4b3..e4933e9f9bf3 100644 --- a/test/CodeGen/f16c-builtins.c +++ b/test/CodeGen/f16c-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +f16c -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +f16c -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/fixup-depth-overflow.c b/test/CodeGen/fixup-depth-overflow.c index be8f54284ec7..af452d05b643 100644 --- a/test/CodeGen/fixup-depth-overflow.c +++ b/test/CodeGen/fixup-depth-overflow.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -O1 -disable-llvm-optzns -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s #define M if (x) goto L1; #define M10 M M M M M M M M M M diff --git a/test/CodeGen/fma-builtins.c b/test/CodeGen/fma-builtins.c index 922f12b8b3d3..91a2efe7637e 100644 --- a/test/CodeGen/fma-builtins.c +++ b/test/CodeGen/fma-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +fma -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +fma -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/fma4-builtins.c b/test/CodeGen/fma4-builtins.c index 69cbcd83d370..3ca5fac68892 100644 --- a/test/CodeGen/fma4-builtins.c +++ b/test/CodeGen/fma4-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +fma4 -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +fma4 -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/fsgsbase-builtins.c b/test/CodeGen/fsgsbase-builtins.c index 5e9ba8c9dde4..25f4b311a596 100644 --- a/test/CodeGen/fsgsbase-builtins.c +++ b/test/CodeGen/fsgsbase-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +fsgsbase -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +fsgsbase -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/func-in-block.c b/test/CodeGen/func-in-block.c index 503695f8c378..937390c1cdb6 100644 --- a/test/CodeGen/func-in-block.c +++ b/test/CodeGen/func-in-block.c @@ -15,5 +15,5 @@ int main() return 0; // not reached } -// CHECK: @__func__.__main_block_invoke = private unnamed_addr constant [20 x i8] c"__main_block_invoke\00" -// CHECK: call void @PRINTF({{.*}}@__func__.__main_block_invoke +// CHECK: @__func__.__main_block_invoke = private unnamed_addr constant [18 x i8] c"main_block_invoke\00" +// CHECK: call void @PRINTF({{.*}}@__func__.__main_block_invoke diff --git a/test/CodeGen/function-attributes.c b/test/CodeGen/function-attributes.c index 8f682a715d73..2139f6fe6546 100644 --- a/test/CodeGen/function-attributes.c +++ b/test/CodeGen/function-attributes.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -disable-llvm-optzns -Os -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -disable-llvm-optzns -Os -std=c99 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -disable-llvm-passes -Os -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -disable-llvm-passes -Os -std=c99 -o - %s | FileCheck %s // CHECK: define signext i8 @f0(i32 %x) [[NUW:#[0-9]+]] // CHECK: define zeroext i8 @f1(i32 %x) [[NUW]] // CHECK: define void @f2(i8 signext %x) [[NUW]] diff --git a/test/CodeGen/incomplete-function-type-2.c b/test/CodeGen/incomplete-function-type-2.c index 41dd5fec4b25..4ed065a5f868 100644 --- a/test/CodeGen/incomplete-function-type-2.c +++ b/test/CodeGen/incomplete-function-type-2.c @@ -16,4 +16,4 @@ void test10_foo(test10_F3 p1) p1(0.0); } -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/inline-optim.c b/test/CodeGen/inline-optim.c index f8b355afd9c1..c6038de173f5 100644 --- a/test/CodeGen/inline-optim.c +++ b/test/CodeGen/inline-optim.c @@ -1,9 +1,15 @@ // Make sure -finline-functions family flags are behaving correctly. 
- +// +// REQUIRES: x86-registered-target +// // RUN: %clang_cc1 -triple i686-pc-win32 -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s +// RUN: %clang_cc1 -triple i686-pc-win32 -fexperimental-new-pass-manager -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s // RUN: %clang_cc1 -triple i686-pc-win32 -O3 -fno-inline-functions -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s -// RUN: %clang_cc1 -triple i686-pc-win32 -finline-hint-functions -emit-llvm %s -o - | FileCheck -check-prefix=HINT %s -// RUN: %clang_cc1 -triple i686-pc-win32 -finline-functions -emit-llvm %s -o - | FileCheck -check-prefix=INLINE %s +// RUN: %clang_cc1 -triple i686-pc-win32 -fexperimental-new-pass-manager -O3 -fno-inline-functions -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s +// RUN: %clang_cc1 -triple i686-pc-win32 -O3 -finline-hint-functions -emit-llvm %s -o - | FileCheck -check-prefix=HINT %s +// RUN: %clang_cc1 -triple i686-pc-win32 -fexperimental-new-pass-manager -O3 -finline-hint-functions -emit-llvm %s -o - | FileCheck -check-prefix=HINT %s +// RUN: %clang_cc1 -triple i686-pc-win32 -O3 -finline-functions -emit-llvm %s -o - | FileCheck -check-prefix=INLINE %s +// RUN: %clang_cc1 -triple i686-pc-win32 -fexperimental-new-pass-manager -O3 -finline-functions -emit-llvm %s -o - | FileCheck -check-prefix=INLINE %s inline int inline_hint(int a, int b) { return(a+b); } diff --git a/test/CodeGen/inline.c b/test/CodeGen/inline.c index fe7efe36e653..e17b251df000 100644 --- a/test/CodeGen/inline.c +++ b/test/CodeGen/inline.c @@ -1,5 +1,8 @@ +// REQUIRES: x86-registered-target +// // RUN: echo "GNU89 tests:" -// RUN: %clang_cc1 %s -triple i386-unknown-unknown -O1 -disable-llvm-optzns -emit-llvm -o - -std=gnu89 | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 %s -triple i386-unknown-unknown -O1 -disable-llvm-passes -emit-llvm -o - -std=gnu89 | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 %s -triple i386-unknown-unknown -fexperimental-new-pass-manager -O1 -disable-llvm-passes -emit-llvm -o - -std=gnu89 | FileCheck %s --check-prefix=CHECK1 // CHECK1-LABEL: define i32 @foo() // CHECK1-LABEL: define i32 @bar() // CHECK1-LABEL: define void @unreferenced1() @@ -21,7 +24,8 @@ // CHECK1-LABEL: define available_externally void @gnu_ei_inline() // RUN: echo "C99 tests:" -// RUN: %clang_cc1 %s -triple i386-unknown-unknown -O1 -disable-llvm-optzns -emit-llvm -o - -std=gnu99 | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 %s -triple i386-unknown-unknown -O1 -disable-llvm-passes -emit-llvm -o - -std=gnu99 | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 %s -triple i386-unknown-unknown -fexperimental-new-pass-manager -O1 -disable-llvm-passes -emit-llvm -o - -std=gnu99 | FileCheck %s --check-prefix=CHECK2 // CHECK2-LABEL: define i32 @ei() // CHECK2-LABEL: define i32 @bar() // CHECK2-NOT: unreferenced1 @@ -43,7 +47,8 @@ // CHECK2-LABEL: define available_externally void @gnu_ei_inline() // RUN: echo "C++ tests:" -// RUN: %clang_cc1 -x c++ %s -triple i386-unknown-unknown -O1 -disable-llvm-optzns -emit-llvm -o - -std=c++98 | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -x c++ %s -triple i386-unknown-unknown -O1 -disable-llvm-passes -emit-llvm -o - -std=c++98 | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -x c++ %s -triple i386-unknown-unknown -fexperimental-new-pass-manager -O1 -disable-llvm-passes -emit-llvm -o - -std=c++98 | FileCheck %s --check-prefix=CHECK3 // CHECK3-LABEL: define i32 @_Z3barv() // CHECK3-LABEL: define linkonce_odr i32 @_Z3foov() // 
CHECK3-NOT: unreferenced @@ -53,7 +58,8 @@ // CHECK3-LABEL: define linkonce_odr i32 @_Z2eiv() // RUN: echo "MS C Mode tests:" -// RUN: %clang_cc1 %s -triple i386-pc-win32 -O1 -disable-llvm-optzns -emit-llvm -o - -std=c99 | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 %s -triple i386-pc-win32 -O1 -disable-llvm-passes -emit-llvm -o - -std=c99 | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 %s -triple i386-pc-win32 -fexperimental-new-pass-manager -O1 -disable-llvm-passes -emit-llvm -o - -std=c99 | FileCheck %s --check-prefix=CHECK4 // CHECK4-NOT: define weak_odr void @_Exit( // CHECK4-LABEL: define weak_odr i32 @ei() // CHECK4-LABEL: define i32 @bar() diff --git a/test/CodeGen/lifetime2.c b/test/CodeGen/lifetime2.c index ffff5cca12f0..0d22282fdd43 100644 --- a/test/CodeGen/lifetime2.c +++ b/test/CodeGen/lifetime2.c @@ -1,8 +1,9 @@ -// RUN: %clang -S -emit-llvm -o - -O2 %s | FileCheck %s -check-prefix=O2 -// RUN: %clang -S -emit-llvm -o - -O0 %s | FileCheck %s -check-prefix=O0 +// RUN: %clang -S -emit-llvm -o - -O2 %s | FileCheck %s -check-prefixes=CHECK,O2 +// RUN: %clang -S -emit-llvm -o - -O0 %s | FileCheck %s -check-prefixes=CHECK,O0 extern int bar(char *A, int n); +// CHECK-LABEL: @foo // O0-NOT: @llvm.lifetime.start int foo (int n) { if (n) { @@ -15,3 +16,76 @@ int foo (int n) { return bar(A, 2); } } + +// CHECK-LABEL: @no_goto_bypass +void no_goto_bypass() { + // O2: @llvm.lifetime.start(i64 1 + char x; +l1: + bar(&x, 1); + // O2: @llvm.lifetime.start(i64 5 + // O2: @llvm.lifetime.end(i64 5 + char y[5]; + bar(y, 5); + goto l1; + // Infinite loop + // O2-NOT: @llvm.lifetime.end(i64 1 +} + +// CHECK-LABEL: @goto_bypass +void goto_bypass() { + { + // O2-NOT: @llvm.lifetime.start(i64 1 + // O2-NOT: @llvm.lifetime.end(i64 1 + char x; + l1: + bar(&x, 1); + } + goto l1; +} + +// CHECK-LABEL: @no_switch_bypass +void no_switch_bypass(int n) { + switch (n) { + case 1: { + // O2: @llvm.lifetime.start(i64 1 + // O2: @llvm.lifetime.end(i64 1 + char x; + bar(&x, 1); + break; + } + case 2: + n = n; + // O2: @llvm.lifetime.start(i64 5 + // O2: @llvm.lifetime.end(i64 5 + char y[5]; + bar(y, 5); + break; + } +} + +// CHECK-LABEL: @switch_bypass +void switch_bypass(int n) { + switch (n) { + case 1: + n = n; + // O2-NOT: @llvm.lifetime.start(i64 1 + // O2-NOT: @llvm.lifetime.end(i64 1 + char x; + bar(&x, 1); + break; + case 2: + bar(&x, 1); + break; + } +} + +// CHECK-LABEL: @indirect_jump +void indirect_jump(int n) { + char x; + // O2-NOT: @llvm.lifetime + void *T[] = {&&L}; + goto *T[n]; +L: + bar(&x, 1); +} diff --git a/test/CodeGen/lzcnt-builtins.c b/test/CodeGen/lzcnt-builtins.c index 2f8308670bb7..cc5d458c7630 100644 --- a/test/CodeGen/lzcnt-builtins.c +++ b/test/CodeGen/lzcnt-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/mangle-blocks.c b/test/CodeGen/mangle-blocks.c index 0023f53b3490..e8de92d8b400 100644 --- a/test/CodeGen/mangle-blocks.c +++ b/test/CodeGen/mangle-blocks.c @@ -11,13 +11,13 @@ void (^mangle(void))(void) { }; } -// CHECK: @__func__.__mangle_block_invoke_2 = private unnamed_addr constant [24 x i8] c"__mangle_block_invoke_2\00", align 1 +// CHECK: @__func__.__mangle_block_invoke_2 = private unnamed_addr constant [22 x i8] c"mangle_block_invoke_2\00", align 1 // CHECK: @.str = private unnamed_addr constant {{.*}}, align 1 // CHECK: @.str.1 = private unnamed_addr constant [7 x i8] c"mangle\00", align 1 // CHECK: define internal void @__mangle_block_invoke(i8* %.block_descriptor) // CHECK: define internal void @__mangle_block_invoke_2(i8* %.block_descriptor){{.*}}{ -// CHECK: call void @__assert_rtn(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @__func__.__mangle_block_invoke_2, i32 0, i32 0), i8* getelementptr inbounds {{.*}}, i32 9, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.1, i32 0, i32 0)) +// CHECK: call void @__assert_rtn(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @__func__.__mangle_block_invoke_2, i32 0, i32 0), i8* getelementptr inbounds {{.*}}, i32 9, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.1, i32 0, i32 0)) // CHECK: } diff --git a/test/CodeGen/may-alias.c b/test/CodeGen/may-alias.c index 520b7abea6c1..41eda8587068 100644 --- a/test/CodeGen/may-alias.c +++ b/test/CodeGen/may-alias.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -no-struct-path-tbaa -disable-llvm-optzns -o - %s | FileCheck %s -// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -disable-llvm-optzns -o - %s | FileCheck %s -check-prefix=PATH +// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -no-struct-path-tbaa -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -disable-llvm-passes -o - %s | FileCheck %s -check-prefix=PATH // Types with the may_alias attribute should be considered equivalent // to char for aliasing. 
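As an aside (not part of the patch), the may_alias comment above is easy to demonstrate standalone; the attribute spelling below is the GCC/Clang one the test relies on:

typedef int __attribute__((may_alias)) aliasing_int;

/* A may_alias-qualified lvalue may read storage of any type, just as
   char may, so this does not violate strict aliasing; doing the same
   through a plain int* would be undefined and may be miscompiled
   under TBAA at -O1 and above. */
float set_low_mantissa_bit(float f) {
  aliasing_int *p = (aliasing_int *)&f;
  *p |= 1; /* set the lowest bit of the float's representation */
  return f;
}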
diff --git a/test/CodeGen/mcount.c b/test/CodeGen/mcount.c index 99dc90795b5e..7f915841952e 100644 --- a/test/CodeGen/mcount.c +++ b/test/CodeGen/mcount.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -pg -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -pg -triple i386-unknown-unknown -emit-llvm -O2 -o - %s | FileCheck %s // RUN: %clang_cc1 -pg -triple powerpc-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s // RUN: %clang_cc1 -pg -triple powerpc64-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s // RUN: %clang_cc1 -pg -triple powerpc64le-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s @@ -12,7 +13,20 @@ // RUN: %clang_cc1 -pg -triple powerpc64le-netbsd -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s // RUN: %clang_cc1 -pg -triple sparc-netbsd -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s // RUN: %clang_cc1 -pg -triple sparc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-PREFIXED %s -void foo(void) { -// CHECK: call void @mcount() -// CHECK-PREFIXED: call void @_mcount() +// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s -check-prefix=NO-MCOUNT + +int bar(void) { + return 0; } + +int foo(void) { + return bar(); +} + +int main(void) { + return foo(); +} + +// CHECK: attributes #{{[0-9]+}} = { {{.*}}"counting-function"="mcount"{{.*}} } +// CHECK-PREFIXED: attributes #{{[0-9]+}} = { {{.*}}"counting-function"="_mcount"{{.*}} } +// NO-MCOUNT-NOT: attributes #{{[0-9]}} = { {{.*}}"counting-function"={{.*}} } diff --git a/test/CodeGen/mips16-attr.c b/test/CodeGen/mips16-attr.c index 18799be6f0d7..128a1bdb8ebd 100644 --- a/test/CodeGen/mips16-attr.c +++ b/test/CodeGen/mips16-attr.c @@ -11,7 +11,7 @@ void __attribute__((nomips16)) nofoo (void) { // CHECK: define void @nofoo() [[NOMIPS16:#[0-9]+]] -// CHECK: attributes [[MIPS16]] = { nounwind {{.*}} "mips16" {{.*}} } +// CHECK: attributes [[MIPS16]] = { noinline nounwind {{.*}} "mips16" {{.*}} } -// CHECK: attributes [[NOMIPS16]] = { nounwind {{.*}} "nomips16" {{.*}} } +// CHECK: attributes [[NOMIPS16]] = { noinline nounwind {{.*}} "nomips16" {{.*}} } diff --git a/test/CodeGen/mmx-builtins.c b/test/CodeGen/mmx-builtins.c index 2bf497d58aaf..ddc6f66548ae 100644 --- a/test/CodeGen/mmx-builtins.c +++ b/test/CodeGen/mmx-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/mozilla-ms-inline-asm.c b/test/CodeGen/mozilla-ms-inline-asm.c index b8b7a2d677dd..3335a87fefe5 100644 --- a/test/CodeGen/mozilla-ms-inline-asm.c +++ b/test/CodeGen/mozilla-ms-inline-asm.c @@ -20,7 +20,7 @@ void invoke(void* that, unsigned methodIndex, // CHECK: call void asm sideeffect inteldialect // CHECK: mov edx,dword ptr $1 // CHECK: test edx,edx -// CHECK: jz {{[^_]*}}__MSASMLABEL_.0__noparams +// CHECK: jz {{[^_]*}}__MSASMLABEL_.${:uid}__noparams // ^ Can't use {{.*}} here because the matching is greedy. // CHECK: mov eax,edx // CHECK: shl eax,$$3 @@ -28,7 +28,7 @@ void invoke(void* that, unsigned methodIndex, // CHECK: mov ecx,esp // CHECK: push dword ptr $0 // CHECK: call dword ptr $2 -// CHECK: {{.*}}__MSASMLABEL_.0__noparams: +// CHECK: {{.*}}__MSASMLABEL_.${:uid}__noparams: // CHECK: mov ecx,dword ptr $3 // CHECK: push ecx // CHECK: mov edx,[ecx] diff --git a/test/CodeGen/mrtd.c b/test/CodeGen/mrtd.c index 8d2aeeec49df..e526e0ba0041 100644 --- a/test/CodeGen/mrtd.c +++ b/test/CodeGen/mrtd.c @@ -25,4 +25,4 @@ void quux(int a1, int a2, int a3) { // CHECK-LABEL: define x86_stdcallcc void @quux // CHECK: call void (i32, ...) @qux -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/ms-barriers-intrinsics.c b/test/CodeGen/ms-barriers-intrinsics.c new file mode 100644 index 000000000000..b0dfc3042a6e --- /dev/null +++ b/test/CodeGen/ms-barriers-intrinsics.c @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple i686--windows -emit-llvm %s -o - \ +// RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-I386 +// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple x86_64--windows -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64 + +// intrin.h needs size_t, but -ffreestanding prevents us from getting it from +// stddef.h. Work around it with this typedef. 
+typedef __SIZE_TYPE__ size_t; + +#include <intrin.h> + +void test_ReadWriteBarrier() { _ReadWriteBarrier(); } +// CHECK-LABEL: define void @test_ReadWriteBarrier +// CHECK: fence singlethread seq_cst +// CHECK: ret void +// CHECK: } + +void test_ReadBarrier() { _ReadBarrier(); } +// CHECK-LABEL: define void @test_ReadBarrier +// CHECK: fence singlethread seq_cst +// CHECK: ret void +// CHECK: } + +void test_WriteBarrier() { _WriteBarrier(); } +// CHECK-LABEL: define void @test_WriteBarrier +// CHECK: fence singlethread seq_cst +// CHECK: ret void +// CHECK: } + +#if defined(__x86_64__) +void test__faststorefence() { __faststorefence(); } +// CHECK-X64-LABEL: define void @test__faststorefence +// CHECK-X64: fence seq_cst +// CHECK-X64: ret void +// CHECK-X64: } +#endif + diff --git a/test/CodeGen/ms-declspecs.c b/test/CodeGen/ms-declspecs.c index 91f5aa210485..4842050e5be5 100644 --- a/test/CodeGen/ms-declspecs.c +++ b/test/CodeGen/ms-declspecs.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i386-pc-win32 %s -emit-llvm -fms-compatibility -o - | FileCheck %s +// RUN: %clang_cc1 -triple i386-pc-win32 %s -emit-llvm -fms-compatibility -O2 -disable-llvm-optzns -o - | FileCheck %s __declspec(selectany) int x1 = 1; const __declspec(selectany) int x2 = 2; diff --git a/test/CodeGen/ms-inline-asm.c b/test/CodeGen/ms-inline-asm.c index ad9e4f361b70..6efc09aec51d 100644 --- a/test/CodeGen/ms-inline-asm.c +++ b/test/CodeGen/ms-inline-asm.c @@ -47,7 +47,7 @@ void t6(void) { void t7() { __asm { - int 0x2c ; } asm comments are fun! }{ + int 0x2cU ; } asm comments are fun! }{ } __asm { { @@ -56,7 +56,7 @@ void t7() { } __asm {} // CHECK: t7 -// CHECK: call void asm sideeffect inteldialect "int $$0x2c", "~{dirflag},~{fpsr},~{flags}"() +// CHECK: call void asm sideeffect inteldialect "int $$0x2cU", "~{dirflag},~{fpsr},~{flags}"() // CHECK: call void asm sideeffect inteldialect "", "~{dirflag},~{fpsr},~{flags}"() } @@ -171,8 +171,8 @@ void t17() { // CHECK: t17 __asm _emit 0x4A // CHECK: .byte 0x4A - __asm _emit 0x43 -// CHECK: .byte 0x43 + __asm _emit 0x43L +// CHECK: .byte 0x43L __asm _emit 0x4B // CHECK: .byte 0x4B __asm _EMIT 0x4B @@ -219,11 +219,11 @@ void t20() { void t21() { __asm { __asm push ebx - __asm mov ebx, 0x07 + __asm mov ebx, 07H __asm pop ebx } // CHECK: t21 -// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$0x07\0A\09pop ebx", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"() +// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$07H\0A\09pop ebx", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"() } extern void t22_helper(int x); @@ -249,7 +249,7 @@ void t23() { the_label: } // CHECK: t23 -// CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.0__the_label:", "~{dirflag},~{fpsr},~{flags}"() +// CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.${:uid}__the_label:", "~{dirflag},~{fpsr},~{flags}"() } void t24_helper(void) {} @@ -262,15 +262,15 @@ void t24() { void t25() { // CHECK: t25 __asm mov eax, 0ffffffffh -// CHECK: mov eax, $$4294967295 - __asm mov eax, 0fh +// CHECK: mov eax, $$0ffffffffh + __asm mov eax, 0fhU // CHECK: mov eax, $$15 __asm mov eax, 0a2h +// CHECK: mov eax, $$0a2h + __asm mov eax, 10100010b +// CHECK: mov eax, $$10100010b + __asm mov eax, 10100010BU // CHECK: mov eax, $$162 - __asm mov eax, 0xa2h -// CHECK: mov eax, $$0xa2h - __asm mov eax, 0xa2 -// CHECK: mov eax, $$0xa2 // CHECK: "~{eax},~{dirflag},~{fpsr},~{flags}"() } @@ -595,7 +595,7 @@ void label1() { jmp label } // CHECK-LABEL: define void @label1() 
- // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.1__label:\0A\09jmp {{.*}}__MSASMLABEL_.1__label", "~{dirflag},~{fpsr},~{flags}"() [[ATTR1:#[0-9]+]] + // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.${:uid}__label:\0A\09jmp {{.*}}__MSASMLABEL_.${:uid}__label", "~{dirflag},~{fpsr},~{flags}"() [[ATTR1:#[0-9]+]] } void label2() { @@ -604,7 +604,7 @@ void label2() { label: } // CHECK-LABEL: define void @label2 - // CHECK: call void asm sideeffect inteldialect "jmp {{.*}}__MSASMLABEL_.2__label\0A\09{{.*}}__MSASMLABEL_.2__label:", "~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "jmp {{.*}}__MSASMLABEL_.${:uid}__label\0A\09{{.*}}__MSASMLABEL_.${:uid}__label:", "~{dirflag},~{fpsr},~{flags}"() } void label3() { @@ -613,7 +613,7 @@ void label3() { mov eax, label } // CHECK-LABEL: define void @label3 - // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.3__label:\0A\09mov eax, {{.*}}__MSASMLABEL_.3__label", "~{eax},~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.${:uid}__label:\0A\09mov eax, {{.*}}__MSASMLABEL_.${:uid}__label", "~{eax},~{dirflag},~{fpsr},~{flags}"() } void label4() { @@ -622,7 +622,7 @@ void label4() { mov eax, [label] } // CHECK-LABEL: define void @label4 - // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.4__label:\0A\09mov eax, {{.*}}__MSASMLABEL_.4__label", "~{eax},~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "{{.*}}__MSASMLABEL_.${:uid}__label:\0A\09mov eax, {{.*}}__MSASMLABEL_.${:uid}__label", "~{eax},~{dirflag},~{fpsr},~{flags}"() } void label5() { @@ -631,7 +631,16 @@ void label5() { dollar_label$: } // CHECK-LABEL: define void @label5 - // CHECK: call void asm sideeffect inteldialect "jmp {{.*}}__MSASMLABEL_.5__dollar_label$$\0A\09{{.*}}__MSASMLABEL_.5__dollar_label$$:", "~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "jmp {{.*}}__MSASMLABEL_.${:uid}__dollar_label$$\0A\09{{.*}}__MSASMLABEL_.${:uid}__dollar_label$$:", "~{dirflag},~{fpsr},~{flags}"() +} + +void label6(){ + __asm { + jmp short label + label: + } + // CHECK-LABEL: define void @label6 + // CHECK: call void asm sideeffect inteldialect "jmp {{.*}}__MSASMLABEL_.${:uid}__label\0A\09{{.*}}__MSASMLABEL_.${:uid}__label:", "~{dirflag},~{fpsr},~{flags}"() } typedef union _LARGE_INTEGER { @@ -653,4 +662,6 @@ int test_indirect_field(LARGE_INTEGER LargeInteger) { // CHECK: call i32 asm sideeffect inteldialect "mov eax, dword ptr $1", // MS ASM containing labels must not be duplicated (PR23715). 
-// CHECK: attributes [[ATTR1]] = { {{.*}}noduplicate{{.*}} } +// CHECK: attributes [[ATTR1]] = { +// CHECK-NOT: noduplicate +// CHECK-SAME: }{{$}} diff --git a/test/CodeGen/ms-inline-asm.cpp b/test/CodeGen/ms-inline-asm.cpp index 123a0e3d6d3d..424c1992cc2c 100644 --- a/test/CodeGen/ms-inline-asm.cpp +++ b/test/CodeGen/ms-inline-asm.cpp @@ -126,14 +126,14 @@ void t7_using() { void t8() { __asm some_label: // CHECK-LABEL: define void @_Z2t8v() - // CHECK: call void asm sideeffect inteldialect "L__MSASMLABEL_.1__some_label:", "~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "L__MSASMLABEL_.${:uid}__some_label:", "~{dirflag},~{fpsr},~{flags}"() struct A { static void g() { __asm jmp some_label ; This should jump forwards __asm some_label: __asm nop // CHECK-LABEL: define internal void @_ZZ2t8vEN1A1gEv() - // CHECK: call void asm sideeffect inteldialect "jmp L__MSASMLABEL_.2__some_label\0A\09L__MSASMLABEL_.2__some_label:\0A\09nop", "~{dirflag},~{fpsr},~{flags}"() + // CHECK: call void asm sideeffect inteldialect "jmp L__MSASMLABEL_.${:uid}__some_label\0A\09L__MSASMLABEL_.${:uid}__some_label:\0A\09nop", "~{dirflag},~{fpsr},~{flags}"() } }; A::g(); diff --git a/test/CodeGen/ms-intrinsics-rotations.c b/test/CodeGen/ms-intrinsics-rotations.c new file mode 100644 index 000000000000..65d25cbe85eb --- /dev/null +++ b/test/CodeGen/ms-intrinsics-rotations.c @@ -0,0 +1,181 @@ +// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple i686--windows -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple thumbv7--windows -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple x86_64--windows -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple i686--linux -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple x86_64--linux -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG
+
+// rotate left
+
+unsigned char test_rotl8(unsigned char value, unsigned char shift) {
+ return _rotl8(value, shift);
+}
+// CHECK: i8 @test_rotl8
+// CHECK: [[SHIFT:%[0-9]+]] = and i8 %{{[0-9]+}}, 7
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i8 8, [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i8 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i8 [[VALUE]], i8 [[ROTATED]]
+// CHECK: ret i8 [[RESULT]]
+// CHECK: }
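For orientation (an editor's sketch, not part of the patch), the CHECK block above encodes the following branch-free lowering of _rotl8. The select exists because "lshr i8 %value, 8" would be poison in LLVM IR when the masked shift is zero; in C the integer promotions keep the equivalent shift well defined:

static unsigned char rotl8_reference(unsigned char value, unsigned char shift) {
  unsigned char s = shift & 7;                          /* and i8 %shift, 7   */
  unsigned char hi = (unsigned char)(value << s);       /* shl                */
  unsigned char lo = (unsigned char)(value >> (8 - s)); /* lshr by negshift   */
  unsigned char rotated = hi | lo;                      /* or                 */
  return s == 0 ? value : rotated;                      /* icmp eq 0 + select */
}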
+
+unsigned short test_rotl16(unsigned short value, unsigned char shift) {
+ return _rotl16(value, shift);
+}
+// CHECK: i16 @test_rotl16
+// CHECK: [[SHIFT:%[0-9]+]] = and i16 %{{[0-9]+}}, 15
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i16 16, [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i16 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i16 [[VALUE]], i16 [[ROTATED]]
+// CHECK: ret i16 [[RESULT]]
+// CHECK: }
+
+unsigned int test_rotl(unsigned int value, int shift) {
+ return _rotl(value, shift);
+}
+// CHECK: i32 @test_rotl
+// CHECK: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK: ret i32 [[RESULT]]
+// CHECK: }
+
+unsigned long test_lrotl(unsigned long value, int shift) {
+ return _lrotl(value, shift);
+}
+// CHECK-32BIT-LONG: i32 @test_lrotl
+// CHECK-32BIT-LONG: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
+// CHECK-32BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
+// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[NEGSHIFT]]
+// CHECK-32BIT-LONG: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
+// CHECK-32BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
+// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK-32BIT-LONG: ret i32 [[RESULT]]
+// CHECK-32BIT-LONG: }
+
+// CHECK-64BIT-LONG: i64 @test_lrotl
+// CHECK-64BIT-LONG: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
+// CHECK-64BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
+// CHECK-64BIT-LONG: [[HIGH:%[0-9]+]] = shl i64 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK-64BIT-LONG: [[LOW:%[0-9]+]] = lshr i64 [[VALUE]], [[NEGSHIFT]]
+// CHECK-64BIT-LONG: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
+// CHECK-64BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
+// CHECK-64BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK-64BIT-LONG: ret i64 [[RESULT]]
+// CHECK-64BIT-LONG: }
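_lrotl (and _lrotr below) is the one intrinsic here whose width depends on the ABI, hence the CHECK-32BIT-LONG/CHECK-64BIT-LONG split: every Windows triple in the RUN lines is LLP64, keeping long at 32 bits even on x86_64, i686 Linux is ILP32, and only x86_64 Linux is LP64 with a 64-bit long. A quick hosted-C probe (illustration only, not patch content):

#include <limits.h>
#include <stdio.h>

int main(void) {
  /* Prints 32 under the CHECK-32BIT-LONG configurations and 64 under
     the CHECK-64BIT-LONG one. */
  printf("long is %d bits\n", (int)(sizeof(long) * CHAR_BIT));
  return 0;
}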
+
+unsigned __int64 test_rotl64(unsigned __int64 value, int shift) {
+ return _rotl64(value, shift);
+}
+// CHECK: i64 @test_rotl64
+// CHECK: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK: ret i64 [[RESULT]]
+// CHECK: }
+
+// rotate right
+
+unsigned char test_rotr8(unsigned char value, unsigned char shift) {
+ return _rotr8(value, shift);
+}
+// CHECK: i8 @test_rotr8
+// CHECK: [[SHIFT:%[0-9]+]] = and i8 %{{[0-9]+}}, 7
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i8 8, [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i8 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i8 [[VALUE]], i8 [[ROTATED]]
+// CHECK: ret i8 [[RESULT]]
+// CHECK: }
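The rotate-right tests mirror the rotate-left pattern with the shift directions swapped; a matching sketch (same caveats as the rotl8 sketch above):

static unsigned char rotr8_reference(unsigned char value, unsigned char shift) {
  unsigned char s = shift & 7;                          /* and i8 %shift, 7 */
  unsigned char lo = (unsigned char)(value >> s);       /* lshr             */
  unsigned char hi = (unsigned char)(value << (8 - s)); /* shl by negshift  */
  return s == 0 ? value : (unsigned char)(hi | lo);     /* or, then select  */
}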
+
+unsigned short test_rotr16(unsigned short value, unsigned char shift) {
+ return _rotr16(value, shift);
+}
+// CHECK: i16 @test_rotr16
+// CHECK: [[SHIFT:%[0-9]+]] = and i16 %{{[0-9]+}}, 15
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i16 16, [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i16 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i16 [[VALUE]], i16 [[ROTATED]]
+// CHECK: ret i16 [[RESULT]]
+// CHECK: }
+
+unsigned int test_rotr(unsigned int value, int shift) {
+ return _rotr(value, shift);
+}
+// CHECK: i32 @test_rotr
+// CHECK: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK: ret i32 [[RESULT]]
+// CHECK: }
+
+unsigned long test_lrotr(unsigned long value, int shift) {
+ return _lrotr(value, shift);
+}
+// CHECK-32BIT-LONG: i32 @test_lrotr
+// CHECK-32BIT-LONG: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
+// CHECK-32BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
+// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[NEGSHIFT]]
+// CHECK-32BIT-LONG: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
+// CHECK-32BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
+// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK-32BIT-LONG: ret i32 [[RESULT]]
+// CHECK-32BIT-LONG: }
+
+// CHECK-64BIT-LONG: i64 @test_lrotr
+// CHECK-64BIT-LONG: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
+// CHECK-64BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
+// CHECK-64BIT-LONG: [[LOW:%[0-9]+]] = lshr i64 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK-64BIT-LONG: [[HIGH:%[0-9]+]] = shl i64 [[VALUE]], [[NEGSHIFT]]
+// CHECK-64BIT-LONG: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
+// CHECK-64BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
+// CHECK-64BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK-64BIT-LONG: ret i64 [[RESULT]]
+// CHECK-64BIT-LONG: }
+
+unsigned __int64 test_rotr64(unsigned __int64 value, int shift) {
+ return _rotr64(value, shift);
+}
+// CHECK: i64 @test_rotr64
+// CHECK: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
+// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
+// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE:%[0-9]+]], [[SHIFT]]
+// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE]], [[NEGSHIFT]]
+// CHECK: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
+// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
+// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK: ret i64 [[RESULT]]
+// CHECK: }
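From the caller's side, a hedged usage sample for the intrinsics this file exercises (hosted environment assumed; with clang the <intrin.h> declarations need -fms-extensions/-fms-compatibility, matching the RUN lines above):

#include <intrin.h>
#include <stdio.h>

int main(void) {
  /* 0x80000001 rotated left by 4: the high 1000 bits wrap around low. */
  printf("%#x\n", _rotl(0x80000001u, 4));     /* prints 0x18 */
  /* 0x01 rotated right by 1: the low bit wraps to the top. */
  printf("%#x\n", (unsigned)_rotr8(0x01, 1)); /* prints 0x80 */
  return 0;
}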
diff --git a/test/CodeGen/ms-intrinsics.c b/test/CodeGen/ms-intrinsics.c index ceaa847e9987..25eae44e0272 100644 --- a/test/CodeGen/ms-intrinsics.c +++ b/test/CodeGen/ms-intrinsics.c @@ -1,12 +1,12 @@ // RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ // RUN: -triple i686--windows -Oz -emit-llvm %s -o - \ -// RUN: | FileCheck %s -check-prefix CHECK -check-prefix CHECK-I386 +// RUN: | FileCheck %s -check-prefixes CHECK,CHECK-I386,CHECK-INTEL // RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ // RUN: -triple thumbv7--windows -Oz -emit-llvm %s -o - \ -// RUN: | FileCheck %s +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-ARM-X64 // RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ // RUN: -triple x86_64--windows -Oz -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64 +// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL // intrin.h needs size_t, but -ffreestanding prevents us from getting it from // stddef.h. Work around it with this typedef. @@ -14,6 +14,100 @@ typedef __SIZE_TYPE__ size_t; #include <intrin.h> +#if defined(__i386__) || defined(__x86_64__) +void test__stosb(unsigned char *Dest, unsigned char Data, size_t Count) { + return __stosb(Dest, Data, Count); +} + +// CHECK-I386: define{{.*}}void @test__stosb +// CHECK-I386: tail call void @llvm.memset.p0i8.i32(i8* %Dest, i8 %Data, i32 %Count, i32 1, i1 true) +// CHECK-I386: ret void +// CHECK-I386: } + +// CHECK-X64: define{{.*}}void @test__stosb +// CHECK-X64: tail call void @llvm.memset.p0i8.i64(i8* %Dest, i8 %Data, i64 %Count, i32 1, i1 true) +// CHECK-X64: ret void +// CHECK-X64: } +#endif + +void *test_ReturnAddress() { + return _ReturnAddress(); +} +// CHECK-LABEL: define{{.*}}i8* @test_ReturnAddress() +// CHECK: = tail call i8* @llvm.returnaddress(i32 0) +// CHECK: ret i8* + +#if defined(__i386__) || defined(__x86_64__) +void *test_AddressOfReturnAddress() { + return _AddressOfReturnAddress(); +} +// CHECK-INTEL-LABEL: define i8* @test_AddressOfReturnAddress() +// CHECK-INTEL: = tail call i8* @llvm.addressofreturnaddress() +// CHECK-INTEL: ret i8* +#endif + +unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) { + return _BitScanForward(Index, Mask); +} +// CHECK: define{{.*}}i8 @test_BitScanForward(i32* {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{ +// CHECK: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i32 %Mask, 0 +// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]] +// CHECK: [[END_LABEL]]: +// CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] +// CHECK: ret i8 [[RESULT]] +// CHECK: [[ISNOTZERO_LABEL]]: +// CHECK: [[INDEX:%[0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %Mask, i1 true) +// CHECK: store i32 [[INDEX]], i32* %Index, align 4 +// CHECK: br label %[[END_LABEL]] + +unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) { + return _BitScanReverse(Index, Mask); +} +// CHECK: define{{.*}}i8 @test_BitScanReverse(i32* {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{ +// CHECK: [[ISNOTZERO:%[0-9]+]] = icmp eq i32 %Mask, 0 +// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]] +// CHECK: [[END_LABEL]]: +// CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, 
%[[ISNOTZERO_LABEL]] ] +// CHECK: ret i8 [[RESULT]] +// CHECK: [[ISNOTZERO_LABEL]]: +// CHECK: [[REVINDEX:%[0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) +// CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 +// CHECK: store i32 [[INDEX]], i32* %Index, align 4 +// CHECK: br label %[[END_LABEL]] + +#if defined(__x86_64__) || defined(__arm__) +unsigned char test_BitScanForward64(unsigned long *Index, unsigned __int64 Mask) { + return _BitScanForward64(Index, Mask); +} +// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanForward64(i32* {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{ +// CHECK-ARM-X64: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i64 %Mask, 0 +// CHECK-ARM-X64: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]] +// CHECK-ARM-X64: [[END_LABEL]]: +// CHECK-ARM-X64: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] +// CHECK-ARM-X64: ret i8 [[RESULT]] +// CHECK-ARM-X64: [[ISNOTZERO_LABEL]]: +// CHECK-ARM-X64: [[INDEX:%[0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %Mask, i1 true) +// CHECK-ARM-X64: [[TRUNC_INDEX:%[0-9]+]] = trunc i64 [[INDEX]] to i32 +// CHECK-ARM-X64: store i32 [[TRUNC_INDEX]], i32* %Index, align 4 +// CHECK-ARM-X64: br label %[[END_LABEL]] + +unsigned char test_BitScanReverse64(unsigned long *Index, unsigned __int64 Mask) { + return _BitScanReverse64(Index, Mask); +} +// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanReverse64(i32* {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{ +// CHECK-ARM-X64: [[ISNOTZERO:%[0-9]+]] = icmp eq i64 %Mask, 0 +// CHECK-ARM-X64: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]] +// CHECK-ARM-X64: [[END_LABEL]]: +// CHECK-ARM-X64: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] +// CHECK-ARM-X64: ret i8 [[RESULT]] +// CHECK-ARM-X64: [[ISNOTZERO_LABEL]]: +// CHECK-ARM-X64: [[REVINDEX:%[0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %Mask, i1 true) +// CHECK-ARM-X64: [[TRUNC_REVINDEX:%[0-9]+]] = trunc i64 [[REVINDEX]] to i32 +// CHECK-ARM-X64: [[INDEX:%[0-9]+]] = xor i32 [[TRUNC_REVINDEX]], 63 +// CHECK-ARM-X64: store i32 [[INDEX]], i32* %Index, align 4 +// CHECK-ARM-X64: br label %[[END_LABEL]] +#endif + void *test_InterlockedExchangePointer(void * volatile *Target, void *Value) { return _InterlockedExchangePointer(Target, Value); } @@ -41,33 +135,287 @@ void *test_InterlockedCompareExchangePointer(void * volatile *Destination, // CHECK: ret i8* %[[RESULT:[0-9]+]] // CHECK: } -long test_InterlockedExchange(long *Target, long Value) { - return _InterlockedExchange(Target, Value); +char test_InterlockedExchange8(char volatile *value, char mask) { + return _InterlockedExchange8(value, mask); } +// CHECK: define{{.*}}i8 @test_InterlockedExchange8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } -// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32* {{[a-z_ ]*}}%Target, i32 %Value){{.*}}{ -// CHECK: %[[EXCHANGE:[0-9]+]] = atomicrmw xchg i32* %Target, i32 %Value seq_cst -// CHECK: ret i32 %[[EXCHANGE:[0-9]+]] +short test_InterlockedExchange16(short volatile *value, short mask) { + return _InterlockedExchange16(value, mask); +} +// CHECK: define{{.*}}i16 @test_InterlockedExchange16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg i16* %value, i16 %mask seq_cst +// CHECK: ret i16 
[[RESULT:%[0-9]+]] // CHECK: } -#if defined(__i386__) -long test__readfsdword(unsigned long Offset) { - return __readfsdword(Offset); +long test_InterlockedExchange(long volatile *value, long mask) { + return _InterlockedExchange(value, mask); } +// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } -// CHECK-I386: define i32 @test__readfsdword(i32 %Offset){{.*}}{ -// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i32 addrspace(257)* -// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i32, i32 addrspace(257)* [[PTR]], align 4 -// CHECK-I386: ret i32 [[VALUE:%[0-9]+]] -// CHECK-I386: } -#endif +char test_InterlockedExchangeAdd8(char volatile *value, char mask) { + return _InterlockedExchangeAdd8(value, mask); +} +// CHECK: define{{.*}}i8 @test_InterlockedExchangeAdd8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } -#if defined(__x86_64__) -unsigned __int64 test__umulh(unsigned __int64 a, unsigned __int64 b) { - return __umulh(a, b); +short test_InterlockedExchangeAdd16(short volatile *value, short mask) { + return _InterlockedExchangeAdd16(value, mask); } -// CHECK-X64-LABEL: define i64 @test__umulh(i64 %a, i64 %b) -// CHECK-X64: = mul nuw i128 % +// CHECK: define{{.*}}i16 @test_InterlockedExchangeAdd16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add i16* %value, i16 %mask seq_cst +// CHECK: ret i16 [[RESULT:%[0-9]+]] +// CHECK: } -#endif +long test_InterlockedExchangeAdd(long volatile *value, long mask) { + return _InterlockedExchangeAdd(value, mask); +} +// CHECK: define{{.*}}i32 @test_InterlockedExchangeAdd(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } + +char test_InterlockedExchangeSub8(char volatile *value, char mask) { + return _InterlockedExchangeSub8(value, mask); +} +// CHECK: define{{.*}}i8 @test_InterlockedExchangeSub8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } + +short test_InterlockedExchangeSub16(short volatile *value, short mask) { + return _InterlockedExchangeSub16(value, mask); +} +// CHECK: define{{.*}}i16 @test_InterlockedExchangeSub16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub i16* %value, i16 %mask seq_cst +// CHECK: ret i16 [[RESULT:%[0-9]+]] +// CHECK: } + +long test_InterlockedExchangeSub(long volatile *value, long mask) { + return _InterlockedExchangeSub(value, mask); +} +// CHECK: define{{.*}}i32 @test_InterlockedExchangeSub(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } + +char test_InterlockedOr8(char volatile *value, char mask) { + return _InterlockedOr8(value, mask); +} +// CHECK: define{{.*}}i8 @test_InterlockedOr8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } + +short test_InterlockedOr16(short volatile *value, short mask) { 
+ return _InterlockedOr16(value, mask); +} +// CHECK: define{{.*}}i16 @test_InterlockedOr16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or i16* %value, i16 %mask seq_cst +// CHECK: ret i16 [[RESULT:%[0-9]+]] +// CHECK: } + +long test_InterlockedOr(long volatile *value, long mask) { + return _InterlockedOr(value, mask); +} +// CHECK: define{{.*}}i32 @test_InterlockedOr(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } + +char test_InterlockedXor8(char volatile *value, char mask) { + return _InterlockedXor8(value, mask); +} +// CHECK: define{{.*}}i8 @test_InterlockedXor8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } + +short test_InterlockedXor16(short volatile *value, short mask) { + return _InterlockedXor16(value, mask); +} +// CHECK: define{{.*}}i16 @test_InterlockedXor16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor i16* %value, i16 %mask seq_cst +// CHECK: ret i16 [[RESULT:%[0-9]+]] +// CHECK: } + +long test_InterlockedXor(long volatile *value, long mask) { + return _InterlockedXor(value, mask); +} +// CHECK: define{{.*}}i32 @test_InterlockedXor(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } + +char test_InterlockedAnd8(char volatile *value, char mask) { + return _InterlockedAnd8(value, mask); +} +// CHECK: define{{.*}}i8 @test_InterlockedAnd8(i8*{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and i8* %value, i8 %mask seq_cst +// CHECK: ret i8 [[RESULT:%[0-9]+]] +// CHECK: } + +short test_InterlockedAnd16(short volatile *value, short mask) { + return _InterlockedAnd16(value, mask); +} +// CHECK: define{{.*}}i16 @test_InterlockedAnd16(i16*{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and i16* %value, i16 %mask seq_cst +// CHECK: ret i16 [[RESULT:%[0-9]+]] +// CHECK: } + +long test_InterlockedAnd(long volatile *value, long mask) { + return _InterlockedAnd(value, mask); +} +// CHECK: define{{.*}}i32 @test_InterlockedAnd(i32*{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and i32* %value, i32 %mask seq_cst +// CHECK: ret i32 [[RESULT:%[0-9]+]] +// CHECK: } + +char test_InterlockedCompareExchange8(char volatile *Destination, char Exchange, char Comperand) { + return _InterlockedCompareExchange8(Destination, Exchange, Comperand); +} +// CHECK: define{{.*}}i8 @test_InterlockedCompareExchange8(i8*{{[a-z_ ]*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile i8* %Destination, i8 %Comperand, i8 %Exchange seq_cst seq_cst +// CHECK: [[RESULT:%[0-9]+]] = extractvalue { i8, i1 } [[TMP]], 0 +// CHECK: ret i8 [[RESULT]] +// CHECK: } + +short test_InterlockedCompareExchange16(short volatile *Destination, short Exchange, short Comperand) { + return _InterlockedCompareExchange16(Destination, Exchange, Comperand); +} +// CHECK: define{{.*}}i16 @test_InterlockedCompareExchange16(i16*{{[a-z_ ]*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile i16* 
%Destination, i16 %Comperand, i16 %Exchange seq_cst seq_cst +// CHECK: [[RESULT:%[0-9]+]] = extractvalue { i16, i1 } [[TMP]], 0 +// CHECK: ret i16 [[RESULT]] +// CHECK: } + +long test_InterlockedCompareExchange(long volatile *Destination, long Exchange, long Comperand) { + return _InterlockedCompareExchange(Destination, Exchange, Comperand); +} +// CHECK: define{{.*}}i32 @test_InterlockedCompareExchange(i32*{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile i32* %Destination, i32 %Comperand, i32 %Exchange seq_cst seq_cst +// CHECK: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0 +// CHECK: ret i32 [[RESULT]] +// CHECK: } + +__int64 test_InterlockedCompareExchange64(__int64 volatile *Destination, __int64 Exchange, __int64 Comperand) { + return _InterlockedCompareExchange64(Destination, Exchange, Comperand); +} +// CHECK: define{{.*}}i64 @test_InterlockedCompareExchange64(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile i64* %Destination, i64 %Comperand, i64 %Exchange seq_cst seq_cst +// CHECK: [[RESULT:%[0-9]+]] = extractvalue { i64, i1 } [[TMP]], 0 +// CHECK: ret i64 [[RESULT]] +// CHECK: } + +short test_InterlockedIncrement16(short volatile *Addend) { + return _InterlockedIncrement16(Addend); +} +// CHECK: define{{.*}}i16 @test_InterlockedIncrement16(i16*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = atomicrmw add i16* %Addend, i16 1 seq_cst +// CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 +// CHECK: ret i16 [[RESULT]] +// CHECK: } + +long test_InterlockedIncrement(long volatile *Addend) { + return _InterlockedIncrement(Addend); +} +// CHECK: define{{.*}}i32 @test_InterlockedIncrement(i32*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = atomicrmw add i32* %Addend, i32 1 seq_cst +// CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 +// CHECK: ret i32 [[RESULT]] +// CHECK: } + +short test_InterlockedDecrement16(short volatile *Addend) { + return _InterlockedDecrement16(Addend); +} +// CHECK: define{{.*}}i16 @test_InterlockedDecrement16(i16*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = atomicrmw sub i16* %Addend, i16 1 seq_cst +// CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], -1 +// CHECK: ret i16 [[RESULT]] +// CHECK: } + +long test_InterlockedDecrement(long volatile *Addend) { + return _InterlockedDecrement(Addend); +} +// CHECK: define{{.*}}i32 @test_InterlockedDecrement(i32*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: [[TMP:%[0-9]+]] = atomicrmw sub i32* %Addend, i32 1 seq_cst +// CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1 +// CHECK: ret i32 [[RESULT]] +// CHECK: } + +#if defined(__x86_64__) || defined(__arm__) +__int64 test_InterlockedExchange64(__int64 volatile *value, __int64 mask) { + return _InterlockedExchange64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedExchange64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw xchg i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedExchangeAdd64(__int64 volatile *value, __int64 mask) { + return _InterlockedExchangeAdd64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedExchangeAdd64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw add i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// 
CHECK-ARM-X64: } +__int64 test_InterlockedExchangeSub64(__int64 volatile *value, __int64 mask) { + return _InterlockedExchangeSub64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedExchangeSub64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw sub i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedOr64(__int64 volatile *value, __int64 mask) { + return _InterlockedOr64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedOr64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw or i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedXor64(__int64 volatile *value, __int64 mask) { + return _InterlockedXor64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedXor64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw xor i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedAnd64(__int64 volatile *value, __int64 mask) { + return _InterlockedAnd64(value, mask); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedAnd64(i64*{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = atomicrmw and i64* %value, i64 %mask seq_cst +// CHECK-ARM-X64: ret i64 [[RESULT:%[0-9]+]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedIncrement64(__int64 volatile *Addend) { + return _InterlockedIncrement64(Addend); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedIncrement64(i64*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK-ARM-X64: [[TMP:%[0-9]+]] = atomicrmw add i64* %Addend, i64 1 seq_cst +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], 1 +// CHECK-ARM-X64: ret i64 [[RESULT]] +// CHECK-ARM-X64: } + +__int64 test_InterlockedDecrement64(__int64 volatile *Addend) { + return _InterlockedDecrement64(Addend); +} +// CHECK-ARM-X64: define{{.*}}i64 @test_InterlockedDecrement64(i64*{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK-ARM-X64: [[TMP:%[0-9]+]] = atomicrmw sub i64* %Addend, i64 1 seq_cst +// CHECK-ARM-X64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], -1 +// CHECK-ARM-X64: ret i64 [[RESULT]] +// CHECK-ARM-X64: } + +#endif diff --git a/test/CodeGen/ms-mm-align.c b/test/CodeGen/ms-mm-align.c index 7130c74b2905..f3ed49dcd5d6 100644 --- a/test/CodeGen/ms-mm-align.c +++ b/test/CodeGen/ms-mm-align.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -target-feature +sse \ // RUN: -triple i686--windows -emit-llvm %s -o - \ -// RUN: | FileCheck %s -check-prefix CHECK +// RUN: | FileCheck %s // intrin.h needs size_t, but -ffreestanding prevents us from getting it from // stddef.h. Work around it with this typedef. 
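The _Interlocked* tests in the hunks above each reduce to a single atomicrmw instruction. Note that atomicrmw, like C11 atomic_fetch_add, yields the OLD value, which is why the increment/decrement CHECK lines pair it with a trailing add; a portable sketch of the same semantics (illustration only, not patch content):

#include <stdatomic.h>

/* _InterlockedIncrement returns the NEW value, hence fetch-add then +1. */
long interlocked_increment_ref(volatile _Atomic long *addend) {
  return atomic_fetch_add(addend, 1) + 1;
}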
diff --git a/test/CodeGen/ms-volatile-arm.c b/test/CodeGen/ms-volatile-arm.c new file mode 100644 index 000000000000..065e624a4c04 --- /dev/null +++ b/test/CodeGen/ms-volatile-arm.c @@ -0,0 +1,13 @@ +// REQUIRES: arm-registered-target +// RUN: %clang_cc1 -triple thumbv7-win32 -emit-llvm -fms-extensions -fms-volatile -o - < %s | FileCheck %s + +void test1(int volatile *p, int v) { + __iso_volatile_store32(p, v); + // CHECK-LABEL: @test1 + // CHECK: store volatile {{.*}}, {{.*}} +} +int test2(const int volatile *p) { + return __iso_volatile_load32(p); + // CHECK-LABEL: @test2 + // CHECK: load volatile {{.*}} +} diff --git a/test/CodeGen/ms-x86-intrinsics.c b/test/CodeGen/ms-x86-intrinsics.c new file mode 100644 index 000000000000..e635220e8c13 --- /dev/null +++ b/test/CodeGen/ms-x86-intrinsics.c @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple i686--windows -Oz -emit-llvm %s -o - \ +// RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-I386 +// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \ +// RUN: -triple x86_64--windows -Oz -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64 + +#if defined(__i386__) +long test__readfsdword(unsigned long Offset) { + return __readfsdword(Offset); +} + +// CHECK-I386-LABEL: define i32 @test__readfsdword(i32 %Offset){{.*}}{ +// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i32 addrspace(257)* +// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i32, i32 addrspace(257)* [[PTR]], align 4 +// CHECK-I386: ret i32 [[VALUE:%[0-9]+]] +// CHECK-I386: } +#endif + +__int64 test__emul(int a, int b) { + return __emul(a, b); +} +// CHECK-LABEL: define i64 @test__emul(i32 %a, i32 %b) +// CHECK: [[X:%[0-9]+]] = sext i32 %a to i64 +// CHECK: [[Y:%[0-9]+]] = sext i32 %b to i64 +// CHECK: [[RES:%[0-9]+]] = mul nsw i64 [[Y]], [[X]] +// CHECK: ret i64 [[RES]] + +unsigned __int64 test__emulu(unsigned int a, unsigned int b) { + return __emulu(a, b); +} +// CHECK-LABEL: define i64 @test__emulu(i32 %a, i32 %b) +// CHECK: [[X:%[0-9]+]] = zext i32 %a to i64 +// CHECK: [[Y:%[0-9]+]] = zext i32 %b to i64 +// CHECK: [[RES:%[0-9]+]] = mul nuw i64 [[Y]], [[X]] +// CHECK: ret i64 [[RES]] + +#if defined(__x86_64__) +__int64 test__mulh(__int64 a, __int64 b) { + return __mulh(a, b); +} +// CHECK-X64-LABEL: define i64 @test__mulh(i64 %a, i64 %b) +// CHECK-X64: = mul nsw i128 % + +unsigned __int64 test__umulh(unsigned __int64 a, unsigned __int64 b) { + return __umulh(a, b); +} +// CHECK-X64-LABEL: define i64 @test__umulh(i64 %a, i64 %b) +// CHECK-X64: = mul nuw i128 % + +__int64 test_mul128(__int64 Multiplier, + __int64 Multiplicand, + __int64 *HighProduct) { + return _mul128(Multiplier, Multiplicand, HighProduct); +} +// CHECK-X64-LABEL: define i64 @test_mul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct) +// CHECK-X64: = sext i64 %Multiplier to i128 +// CHECK-X64: = sext i64 %Multiplicand to i128 +// CHECK-X64: = mul nsw i128 % +// CHECK-X64: store i64 % +// CHECK-X64: ret i64 % + +unsigned __int64 test_umul128(unsigned __int64 Multiplier, + unsigned __int64 Multiplicand, + unsigned __int64 *HighProduct) { + return _umul128(Multiplier, Multiplicand, HighProduct); +} +// CHECK-X64-LABEL: define i64 @test_umul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct) +// CHECK-X64: = zext i64 %Multiplier to i128 +// CHECK-X64: = zext i64 %Multiplicand to i128 +// 
CHECK-X64: = mul nuw i128 % +// CHECK-X64: store i64 % +// CHECK-X64: ret i64 % +#endif diff --git a/test/CodeGen/ms_abi.c b/test/CodeGen/ms_abi.c index 2cca24901b9a..97f66b29c517 100644 --- a/test/CodeGen/ms_abi.c +++ b/test/CodeGen/ms_abi.c @@ -45,24 +45,27 @@ void __attribute__((ms_abi)) f4(int a, ...) { // WIN64-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT]], i8** %[[AP]] // WIN64-NEXT: bitcast i8* %[[AP_CUR]] to i32* + // FIXME: These are different now. We probably need __builtin_ms_va_arg. double _Complex c = __builtin_va_arg(ap, double _Complex); // FREEBSD: %[[AP_CUR2:.*]] = load i8*, i8** %[[AP]] // FREEBSD-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 16 // FREEBSD-NEXT: store i8* %[[AP_NEXT2]], i8** %[[AP]] // FREEBSD-NEXT: bitcast i8* %[[AP_CUR2]] to { double, double }* // WIN64: %[[AP_CUR2:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 16 + // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT2]], i8** %[[AP]] - // WIN64-NEXT: bitcast i8* %[[AP_CUR2]] to { double, double }* + // WIN64-NEXT: %[[CUR2:.*]] = bitcast i8* %[[AP_CUR2]] to { double, double }** + // WIN64-NEXT: load { double, double }*, { double, double }** %[[CUR2]] struct foo d = __builtin_va_arg(ap, struct foo); // FREEBSD: %[[AP_CUR3:.*]] = load i8*, i8** %[[AP]] // FREEBSD-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 16 // FREEBSD-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]] // FREEBSD-NEXT: bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* // WIN64: %[[AP_CUR3:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 16 + // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]] - // WIN64-NEXT: bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* + // WIN64-NEXT: %[[CUR3:.*]] = bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* + // WIN64-NEXT: load %[[STRUCT_FOO]]*, %[[STRUCT_FOO]]** %[[CUR3]] __builtin_ms_va_list ap2; __builtin_ms_va_copy(ap2, ap); // FREEBSD: %[[AP_VAL:.*]] = load i8*, i8** %[[AP]] @@ -88,12 +91,12 @@ void f5(int a, ...) 
{ // WIN64-NEXT: bitcast i8* %[[AP_CUR]] to i32* double _Complex c = __builtin_va_arg(ap, double _Complex); // WIN64: %[[AP_CUR2:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 16 + // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT2]], i8** %[[AP]] // WIN64-NEXT: bitcast i8* %[[AP_CUR2]] to { double, double }* struct foo d = __builtin_va_arg(ap, struct foo); // WIN64: %[[AP_CUR3:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 16 + // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]] // WIN64-NEXT: bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* __builtin_va_list ap2; @@ -124,7 +127,7 @@ void __attribute__((sysv_abi)) f6(__builtin_ms_va_list ap) { // FREEBSD-NEXT: store i8* %[[AP_NEXT2]], i8** %[[AP]] // FREEBSD-NEXT: bitcast i8* %[[AP_CUR2]] to { double, double }* // WIN64: %[[AP_CUR2:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 16 + // WIN64-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT2]], i8** %[[AP]] // WIN64-NEXT: bitcast i8* %[[AP_CUR2]] to { double, double }* struct foo d = __builtin_va_arg(ap, struct foo); @@ -133,7 +136,7 @@ void __attribute__((sysv_abi)) f6(__builtin_ms_va_list ap) { // FREEBSD-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]] // FREEBSD-NEXT: bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* // WIN64: %[[AP_CUR3:.*]] = load i8*, i8** %[[AP]] - // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 16 + // WIN64-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR3]], i64 8 // WIN64-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]] // WIN64-NEXT: bitcast i8* %[[AP_CUR3]] to %[[STRUCT_FOO]]* __builtin_ms_va_list ap2; diff --git a/test/CodeGen/nobuiltin.c b/test/CodeGen/nobuiltin.c index 7cc8164c6f12..f80c3c332ab4 100644 --- a/test/CodeGen/nobuiltin.c +++ b/test/CodeGen/nobuiltin.c @@ -1,17 +1,21 @@ -// RUN: %clang_cc1 -fno-builtin -O1 -S -o - %s | FileCheck %s -// RUN: %clang_cc1 -fno-builtin-memset -O1 -S -o - %s | FileCheck -check-prefix=MEMSET %s +// REQUIRES: x86-registered-target + +// RUN: %clang_cc1 -triple x86_64-linux-gnu -O1 -S -o - %s | FileCheck -check-prefix=STRCPY -check-prefix=MEMSET %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fno-builtin -O1 -S -o - %s | FileCheck -check-prefix=NOSTRCPY -check-prefix=NOMEMSET %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fno-builtin-memset -O1 -S -o - %s | FileCheck -check-prefix=STRCPY -check-prefix=NOMEMSET %s void PR13497() { char content[2]; // make sure we don't optimize this call to strcpy() - // CHECK: __strcpy_chk + // STRCPY-NOT: __strcpy_chk + // NOSTRCPY: __strcpy_chk __builtin___strcpy_chk(content, "", 1); } void PR4941(char *s) { // Make sure we don't optimize this loop to a memset(). 
- // MEMSET-LABEL: PR4941: - // MEMSET-NOT: memset + // NOMEMSET-NOT: memset + // MEMSET: memset for (unsigned i = 0; i < 8192; ++i) s[i] = 0; } diff --git a/test/CodeGen/noexceptionsfpmath.c b/test/CodeGen/noexceptionsfpmath.c new file mode 100644 index 000000000000..a22e285bb72a --- /dev/null +++ b/test/CodeGen/noexceptionsfpmath.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -S -fno-trapping-math %s -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: main +// CHECK: attributes #0 = {{.*}}"no-trapping-math"="true"{{.*}} + +int main() { + return 0; +} diff --git a/test/CodeGen/object-size.c b/test/CodeGen/object-size.c index 6aee57375a48..a3f3bce92956 100644 --- a/test/CodeGen/object-size.c +++ b/test/CodeGen/object-size.c @@ -276,7 +276,7 @@ void test23(struct Test23Ty *p) { // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(&p->t[5], 0); - // CHECK: store i32 20 + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(&p->t[5], 1); // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 true) gi = __builtin_object_size(&p->t[5], 2); @@ -444,7 +444,7 @@ void test29(struct DynStructVar *dv, struct DynStruct0 *d0, // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(ss->snd, 0); - // CHECK: store i32 2 + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(ss->snd, 1); // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 true) gi = __builtin_object_size(ss->snd, 2); @@ -505,7 +505,7 @@ void test31() { // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(ds1[9].snd, 1); - // CHECK: store i32 2 + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(&ss[9].snd[0], 1); // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) @@ -517,3 +517,22 @@ void test31() { // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) gi = __builtin_object_size(&dsv[9].snd[0], 1); } + +// CHECK-LABEL: @PR30346 +void PR30346() { + struct sa_family_t {}; + struct sockaddr { + struct sa_family_t sa_family; + char sa_data[14]; + }; + + struct sockaddr *sa; + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) + gi = __builtin_object_size(sa->sa_data, 0); + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false) + gi = __builtin_object_size(sa->sa_data, 1); + // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 true) + gi = __builtin_object_size(sa->sa_data, 2); + // CHECK: store i32 14 + gi = __builtin_object_size(sa->sa_data, 3); +} diff --git a/test/CodeGen/opt-record.c b/test/CodeGen/opt-record.c new file mode 100644 index 000000000000..3bc3c41f7840 --- /dev/null +++ b/test/CodeGen/opt-record.c @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -O3 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o %t -dwarf-column-info -opt-record-file %t.yaml -emit-obj +// RUN: cat %t.yaml | FileCheck %s +// RUN: llvm-profdata merge %S/Inputs/opt-record.proftext -o %t.profdata +// RUN: %clang_cc1 -O3 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -fprofile-instrument-use-path=%t.profdata %s -o %t -dwarf-column-info -opt-record-file %t.yaml -emit-obj +// RUN: cat %t.yaml | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PGO %s +// REQUIRES: x86-registered-target + +void bar(); +void foo() { bar(); } + +void Test(int *res, int *c, int *d, int *p, int n) { + int i; + +#pragma clang loop vectorize(assume_safety) + for (i = 
0; i < 1600; i++) { + res[i] = (p[i] == 0) ? res[i] : res[i] + d[i]; + } +} + +// CHECK: --- !Missed +// CHECK: Pass: inline +// CHECK: Name: NoDefinition +// CHECK: DebugLoc: +// CHECK: Function: foo +// CHECK-PGO: Hotness: + +// CHECK: --- !Passed +// CHECK: Pass: loop-vectorize +// CHECK: Name: Vectorized +// CHECK: DebugLoc: +// CHECK: Function: Test +// CHECK-PGO: Hotness: + diff --git a/test/CodeGen/overloadable.c b/test/CodeGen/overloadable.c index 634820cfe743..1feb12f5b579 100644 --- a/test/CodeGen/overloadable.c +++ b/test/CodeGen/overloadable.c @@ -59,3 +59,38 @@ void foo() { // CHECK: @_Z13addrof_singlePc void *vp3 = &addrof_single; } + + +void ovl_bar(char *) __attribute__((overloadable)); +void ovl_bar(int) __attribute__((overloadable)); + +// CHECK-LABEL: define void @bar +void bar() { + char charbuf[1]; + unsigned char ucharbuf[1]; + + // CHECK: call void @_Z7ovl_barPc + ovl_bar(charbuf); + // CHECK: call void @_Z7ovl_barPc + ovl_bar(ucharbuf); +} + +void ovl_baz(int *, int) __attribute__((overloadable)); +void ovl_baz(unsigned int *, unsigned int) __attribute__((overloadable)); +void ovl_baz2(int, int *) __attribute__((overloadable)); +void ovl_baz2(unsigned int, unsigned int *) __attribute__((overloadable)); +// CHECK-LABEL: define void @baz +void baz() { + unsigned int j; + // Initial rules for incompatible pointer conversions made this overload + // ambiguous. + // CHECK: call void @_Z7ovl_bazPjj + ovl_baz(&j, 0); + // CHECK: call void @_Z7ovl_bazPjj + ovl_baz(&j, 0u); + + // CHECK: call void @_Z8ovl_baz2jPj + ovl_baz2(0, &j); + // CHECK: call void @_Z8ovl_baz2jPj + ovl_baz2(0u, &j); +} diff --git a/test/CodeGen/pass-object-size.c b/test/CodeGen/pass-object-size.c index 6e2bc2090eda..6f5827befa71 100644 --- a/test/CodeGen/pass-object-size.c +++ b/test/CodeGen/pass-object-size.c @@ -59,7 +59,8 @@ void test1() { // CHECK-LABEL: define void @test2 void test2(struct Foo *t) { - // CHECK: call i32 @ObjectSize1(i8* %{{.*}}, i64 36) + // CHECK: [[VAR:%[0-9]+]] = call i64 @llvm.objectsize + // CHECK: call i32 @ObjectSize1(i8* %{{.*}}, i64 [[VAR]]) gi = ObjectSize1(&t->t[1]); // CHECK: call i32 @ObjectSize3(i8* %{{.*}}, i64 36) gi = ObjectSize3(&t->t[1]); @@ -168,7 +169,8 @@ void test4(struct Foo *t) { // CHECK: call i32 @_Z27NoViableOverloadObjectSize0PvU17pass_object_size0(i8* %{{.*}}, i64 %{{.*}}) gi = NoViableOverloadObjectSize0(&t[1].t[1]); - // CHECK: call i32 @_Z27NoViableOverloadObjectSize1PvU17pass_object_size1(i8* %{{.*}}, i64 36) + // CHECK: [[VAR:%[0-9]+]] = call i64 @llvm.objectsize + // CHECK: call i32 @_Z27NoViableOverloadObjectSize1PvU17pass_object_size1(i8* %{{.*}}, i64 [[VAR]]) gi = NoViableOverloadObjectSize1(&t[1].t[1]); // CHECK: call i32 @_Z27NoViableOverloadObjectSize2PvU17pass_object_size2(i8* %{{.*}}, i64 %{{.*}}) gi = NoViableOverloadObjectSize2(&t[1].t[1]); @@ -274,7 +276,8 @@ void test7() { // CHECK-LABEL: define void @test8 void test8(struct Foo *t) { - // CHECK: call i32 @"\01Identity"(i8* %{{.*}}, i64 36) + // CHECK: [[VAR:%[0-9]+]] = call i64 @llvm.objectsize + // CHECK: call i32 @"\01Identity"(i8* %{{.*}}, i64 [[VAR]]) gi = AsmObjectSize1(&t[1].t[1]); // CHECK: call i32 @"\01Identity"(i8* %{{.*}}, i64 36) gi = AsmObjectSize3(&t[1].t[1]); diff --git a/test/CodeGen/pclmul-builtins.c b/test/CodeGen/pclmul-builtins.c index ebca89903747..44300f645a9d 100644 --- a/test/CodeGen/pclmul-builtins.c +++ b/test/CodeGen/pclmul-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s 
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <wmmintrin.h> diff --git a/test/CodeGen/pgo-sample.c b/test/CodeGen/pgo-sample.c index c955edfab7a4..e7d2fa61aa1e 100644 --- a/test/CodeGen/pgo-sample.c +++ b/test/CodeGen/pgo-sample.c @@ -2,8 +2,5 @@ // // Ensure Pass PGOInstrumentationGenPass is invoked. // RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -// CHECK: Simplify the CFG -// CHECK: SROA -// CHECK: Combine redundant instructions // CHECK: Remove unused exception handling info // CHECK: Sample profile pass diff --git a/test/CodeGen/pku.c b/test/CodeGen/pku.c index 30565a83d6b3..cb02b03ffe0e 100644 --- a/test/CodeGen/pku.c +++ b/test/CodeGen/pku.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +pku -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pku -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/popcnt-builtins.c b/test/CodeGen/popcnt-builtins.c index 5ae40c7a7688..656a20f98605 100644 --- a/test/CodeGen/popcnt-builtins.c +++ b/test/CodeGen/popcnt-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +popcnt -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/ppc64-complex-parms.c b/test/CodeGen/ppc64-complex-parms.c index 3f2a0c21420f..32163fa4a995 100644 --- a/test/CodeGen/ppc64-complex-parms.c +++ b/test/CodeGen/ppc64-complex-parms.c @@ -180,4 +180,4 @@ void bar_long_long(void) { // CHECK: %[[VAR77:[A-Za-z0-9.]+]] = load i64, i64* %[[VAR76]], align 8 // CHECK: %{{[A-Za-z0-9.]+}} = call i64 @foo_long_long(i64 %[[VAR75]], i64 %[[VAR77]]) -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/ppc64-complex-return.c b/test/CodeGen/ppc64-complex-return.c index cdb0ed6e47a6..02bfe82d4efe 100644 --- a/test/CodeGen/ppc64-complex-return.c +++ b/test/CodeGen/ppc64-complex-return.c @@ -126,4 +126,4 @@ long long bar_long_long(void) { // CHECK: extractvalue { i64, i64 } [[VAR8]], 1 -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/ppc64-dwarf.c b/test/CodeGen/ppc64-dwarf.c new file mode 100644 index 000000000000..679bded453a8 --- /dev/null +++ b/test/CodeGen/ppc64-dwarf.c @@ -0,0 +1,129 @@ +// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +static unsigned char dwarf_reg_size_table[1024]; + +int test() { + __builtin_init_dwarf_reg_size_table(dwarf_reg_size_table); + + return __builtin_dwarf_sp_column(); +} + +// CHECK-LABEL: define signext i32 @test() +// CHECK: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 0), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 1), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x 
i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 2), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 3), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 4), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 5), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 6), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 7), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 8), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 9), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 10), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 11), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 12), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 13), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 14), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 15), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 16), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 17), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 18), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 19), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 20), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 21), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 22), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 23), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 24), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 25), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 26), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 27), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 28), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 29), align 1 +// 
CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 30), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 31), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 32), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 33), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 34), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 35), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 36), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 37), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 38), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 39), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 40), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 41), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 42), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 43), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 44), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 45), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 46), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 47), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 48), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 49), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 50), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 51), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 52), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 53), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 54), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 55), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 56), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 
x i8]* @dwarf_reg_size_table, i32 0, i32 57), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 58), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 59), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 60), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 61), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 62), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 63), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 64), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 65), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 66), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 67), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 68), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 69), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 70), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 71), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 72), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 73), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 74), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 75), align 1 +// CHECK-NEXT: store i8 4, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 76), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 77), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 78), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 79), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 80), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 81), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 82), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 83), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 84), align 1 +// 
CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 85), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 86), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 87), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 88), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 89), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 90), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 91), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 92), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 93), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 94), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 95), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 96), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 97), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 98), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 99), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 100), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 101), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 102), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 103), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 104), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 105), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 106), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 107), align 1 +// CHECK-NEXT: store i8 16, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 108), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 109), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 110), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 111), align 1 +// CHECK-NEXT: store i8 8, i8* 
getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 112), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 113), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 114), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 115), align 1 +// CHECK-NEXT: store i8 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @dwarf_reg_size_table, i32 0, i32 116), align 1 +// CHECK-NEXT: ret i32 1 + diff --git a/test/CodeGen/ppc64-extend.c b/test/CodeGen/ppc64-extend.c index 52e5f136dd95..6b596b47d374 100644 --- a/test/CodeGen/ppc64-extend.c +++ b/test/CodeGen/ppc64-extend.c @@ -13,4 +13,4 @@ int f3(void) { return 0; } unsigned int f4(void) { return 0; } // CHECK: define zeroext i32 @f4() [[NUW]] -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/ppc64-soft-float.c b/test/CodeGen/ppc64-soft-float.c new file mode 100644 index 000000000000..95b18292319d --- /dev/null +++ b/test/CodeGen/ppc64-soft-float.c @@ -0,0 +1,171 @@ +// RUN: %clang_cc1 -msoft-float -mfloat-abi soft -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LE %s +// RUN: %clang_cc1 -msoft-float -mfloat-abi soft -triple powerpc64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-BE %s + +// Test float returns and params. + +// CHECK: define float @func_p1(float %x) +float func_p1(float x) { return x; } + +// CHECK: define double @func_p2(double %x) +double func_p2(double x) { return x; } + +// CHECK: define ppc_fp128 @func_p3(ppc_fp128 %x) +long double func_p3(long double x) { return x; } + +// Test homogeneous float aggregate passing and returning. 
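For context on the soft-float checks that follow: a homogeneous float aggregate is a struct whose members all have the same floating-point type, and under -msoft-float small ones are coerced into GPR-sized integers instead of FPRs. A minimal sketch, restating the fab case from the checks below:

struct fab { float a; float b; };  /* 8 bytes of homogeneous float data               */
/* soft-float LE: coerced to one GPR-sized integer:                                   */
/*   define i64 @func_fab(i64 %x.coerce)                                              */
/* soft-float BE: returned indirectly through an sret pointer, as the checks show     */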
+ +struct f1 { float f[1]; }; +struct f2 { float f[2]; }; +struct f3 { float f[3]; }; +struct f4 { float f[4]; }; +struct f5 { float f[5]; }; +struct f6 { float f[6]; }; +struct f7 { float f[7]; }; +struct f8 { float f[8]; }; +struct f9 { float f[9]; }; + +struct fab { float a; float b; }; +struct fabc { float a; float b; float c; }; + +struct f2a2b { float a[2]; float b[2]; }; + +// CHECK-LE: define i32 @func_f1(float inreg %x.coerce) +// CHECK-BE: define void @func_f1(%struct.f1* noalias sret %agg.result, float inreg %x.coerce) +struct f1 func_f1(struct f1 x) { return x; } + +// CHECK-LE: define i64 @func_f2(i64 %x.coerce) +// CHECK-BE: define void @func_f2(%struct.f2* noalias sret %agg.result, i64 %x.coerce) +struct f2 func_f2(struct f2 x) { return x; } + +// CHECK-LE: define { i64, i64 } @func_f3([2 x i64] %x.coerce) +// CHECK-BE: define void @func_f3(%struct.f3* noalias sret %agg.result, [2 x i64] %x.coerce) +struct f3 func_f3(struct f3 x) { return x; } + +// CHECK-LE: define { i64, i64 } @func_f4([2 x i64] %x.coerce) +// CHECK-BE: define void @func_f4(%struct.f4* noalias sret %agg.result, [2 x i64] %x.coerce) +struct f4 func_f4(struct f4 x) { return x; } + +// CHECK: define void @func_f5(%struct.f5* noalias sret %agg.result, [3 x i64] %x.coerce) +struct f5 func_f5(struct f5 x) { return x; } + +// CHECK: define void @func_f6(%struct.f6* noalias sret %agg.result, [3 x i64] %x.coerce) +struct f6 func_f6(struct f6 x) { return x; } + +// CHECK: define void @func_f7(%struct.f7* noalias sret %agg.result, [4 x i64] %x.coerce) +struct f7 func_f7(struct f7 x) { return x; } + +// CHECK: define void @func_f8(%struct.f8* noalias sret %agg.result, [4 x i64] %x.coerce) +struct f8 func_f8(struct f8 x) { return x; } + +// CHECK: define void @func_f9(%struct.f9* noalias sret %agg.result, [5 x i64] %x.coerce) +struct f9 func_f9(struct f9 x) { return x; } + +// CHECK-LE: define i64 @func_fab(i64 %x.coerce) +// CHECK-BE: define void @func_fab(%struct.fab* noalias sret %agg.result, i64 %x.coerce) +struct fab func_fab(struct fab x) { return x; } + +// CHECK-LE: define { i64, i64 } @func_fabc([2 x i64] %x.coerce) +// CHECK-BE: define void @func_fabc(%struct.fabc* noalias sret %agg.result, [2 x i64] %x.coerce) +struct fabc func_fabc(struct fabc x) { return x; } + +// CHECK-LE: define { i64, i64 } @func_f2a2b([2 x i64] %x.coerce) +// CHECK-BE: define void @func_f2a2b(%struct.f2a2b* noalias sret %agg.result, [2 x i64] %x.coerce) +struct f2a2b func_f2a2b(struct f2a2b x) { return x; } + +// CHECK-LABEL: @call_f1 +// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f1, align 4 +// CHECK: %[[TMP:[^ ]+]] = load float, float* getelementptr inbounds (%struct.f1, %struct.f1* @global_f1, i32 0, i32 0, i32 0), align 4 +// CHECK-LE: call i32 @func_f1(float inreg %[[TMP]]) +// CHECK-BE: call void @func_f1(%struct.f1* sret %[[TMP0]], float inreg %[[TMP]]) +struct f1 global_f1; +void call_f1(void) { global_f1 = func_f1(global_f1); } + +// CHECK-LABEL: @call_f2 +// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f2, align 4 +// CHECK: %[[TMP:[^ ]+]] = load i64, i64* bitcast (%struct.f2* @global_f2 to i64*), align 4 +// CHECK-LE: call i64 @func_f2(i64 %[[TMP]]) +// CHECK-BE: call void @func_f2(%struct.f2* sret %[[TMP0]], i64 %[[TMP]]) +struct f2 global_f2; +void call_f2(void) { global_f2 = func_f2(global_f2); } + +// CHECK-LABEL: @call_f3 +// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f3, align 4 +// CHECK: %[[TMP1:[^ ]+]] = alloca [2 x i64] +// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP1]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f3* @global_f3 to i8*), i64 12, i32 4, i1 false) +// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP1]] +// CHECK-LE: call { i64, i64 } @func_f3([2 x i64] %[[TMP3]]) +// CHECK-BE: call void @func_f3(%struct.f3* sret %[[TMP0]], [2 x i64] %[[TMP3]]) +struct f3 global_f3; +void call_f3(void) { global_f3 = func_f3(global_f3); } + +// CHECK-LABEL: @call_f4 +// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f4, align 4 +// CHECK: %[[TMP:[^ ]+]] = load [2 x i64], [2 x i64]* bitcast (%struct.f4* @global_f4 to [2 x i64]*), align 4 +// CHECK-LE: call { i64, i64 } @func_f4([2 x i64] %[[TMP]]) +// CHECK-BE: call void @func_f4(%struct.f4* sret %[[TMP0]], [2 x i64] %[[TMP]]) +struct f4 global_f4; +void call_f4(void) { global_f4 = func_f4(global_f4); } + +// CHECK-LABEL: @call_f5 +// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f5, align 4 +// CHECK: %[[TMP1:[^ ]+]] = alloca [3 x i64] +// CHECK: %[[TMP2:[^ ]+]] = bitcast [3 x i64]* %[[TMP1]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f5* @global_f5 to i8*), i64 20, i32 4, i1 false) +// CHECK: %[[TMP3:[^ ]+]] = load [3 x i64], [3 x i64]* %[[TMP1]] +// CHECK: call void @func_f5(%struct.f5* sret %[[TMP0]], [3 x i64] %[[TMP3]]) +struct f5 global_f5; +void call_f5(void) { global_f5 = func_f5(global_f5); } + +// CHECK-LABEL: @call_f6 +// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f6, align 4 +// CHECK: %[[TMP:[^ ]+]] = load [3 x i64], [3 x i64]* bitcast (%struct.f6* @global_f6 to [3 x i64]*), align 4 +// CHECK: call void @func_f6(%struct.f6* sret %[[TMP0]], [3 x i64] %[[TMP]]) +struct f6 global_f6; +void call_f6(void) { global_f6 = func_f6(global_f6); } + +// CHECK-LABEL: @call_f7 +// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f7, align 4 +// CHECK: %[[TMP1:[^ ]+]] = alloca [4 x i64], align 8 +// CHECK: %[[TMP2:[^ ]+]] = bitcast [4 x i64]* %[[TMP1]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f7* @global_f7 to i8*), i64 28, i32 4, i1 false) +// CHECK: %[[TMP3:[^ ]+]] = load [4 x i64], [4 x i64]* %[[TMP1]], align 8 +// CHECK: call void @func_f7(%struct.f7* sret %[[TMP0]], [4 x i64] %[[TMP3]]) +struct f7 global_f7; +void call_f7(void) { global_f7 = func_f7(global_f7); } + +// CHECK-LABEL: @call_f8 +// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f8, align 4 +// CHECK: %[[TMP:[^ ]+]] = load [4 x i64], [4 x i64]* bitcast (%struct.f8* @global_f8 to [4 x i64]*), align 4 +// CHECK: call void @func_f8(%struct.f8* sret %[[TMP0]], [4 x i64] %[[TMP]]) +struct f8 global_f8; +void call_f8(void) { global_f8 = func_f8(global_f8); } + +// CHECK-LABEL: @call_f9 +// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64] +// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false) +// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]] +// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]]) +struct f9 global_f9; +void call_f9(void) { global_f9 = func_f9(global_f9); } + +// CHECK-LABEL: @call_fab +// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.fab, align 4 +// CHECK: %[[TMP:[^ ]+]] = load i64, i64* bitcast (%struct.fab* @global_fab to i64*), align 4 +// CHECK-LE: %call = call i64 @func_fab(i64 %[[TMP]]) +// CHECK-BE: call void @func_fab(%struct.fab* sret %[[TMP0]], i64 %[[TMP]]) +struct fab global_fab; +void call_fab(void) { global_fab = func_fab(global_fab); } + +// CHECK-LABEL: 
@call_fabc +// CHECK-BE: %[[TMPX:[^ ]+]] = alloca %struct.fabc, align 4 +// CHECK: %[[TMP0:[^ ]+]] = alloca [2 x i64], align 8 +// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i32 4, i1 false) +// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP0]], align 8 +// CHECK-LE: %call = call { i64, i64 } @func_fabc([2 x i64] %[[TMP3]]) +// CHECK-BE: call void @func_fabc(%struct.fabc* sret %[[TMPX]], [2 x i64] %[[TMP3]]) +struct fabc global_fabc; +void call_fabc(void) { global_fabc = func_fabc(global_fabc); } + diff --git a/test/CodeGen/ppc64-struct-onevect.c b/test/CodeGen/ppc64-struct-onevect.c index 34f4cc0753ac..8cd4126912a5 100644 --- a/test/CodeGen/ppc64-struct-onevect.c +++ b/test/CodeGen/ppc64-struct-onevect.c @@ -9,5 +9,5 @@ v4sf foo (struct s a) { return a.v; } -// CHECK-LABEL: define <4 x float> @foo(<4 x float> inreg %a.coerce) +// CHECK-LABEL: define <4 x float> @foo(<4 x float> inreg returned %a.coerce) // CHECK: ret <4 x float> %a.coerce diff --git a/test/CodeGen/pr27892.c b/test/CodeGen/pr27892.c index 694ce9eb0a52..57722c4671ab 100644 --- a/test/CodeGen/pr27892.c +++ b/test/CodeGen/pr27892.c @@ -7,7 +7,7 @@ long test1(long *p) { // CHECK: %[[p_addr:.*]] = alloca i64*, align 8 // CHECK: store i64* %p, i64** %[[p_addr]], align 8 // CHECK: %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8 -// CHECK: %[[atomic_add:.*]] = atomicrmw volatile add i64* %[[p_load]], i64 1 seq_cst +// CHECK: %[[atomic_add:.*]] = atomicrmw add i64* %[[p_load]], i64 1 seq_cst // CHECK: %[[res:.*]] = add i64 %[[atomic_add]], 1 // CHECK: ret i64 %[[res]] @@ -18,6 +18,6 @@ long test2(long *p) { // CHECK: %[[p_addr:.*]] = alloca i64*, align 8 // CHECK: store i64* %p, i64** %[[p_addr]], align 8 // CHECK: %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8 -// CHECK: %[[atomic_sub:.*]] = atomicrmw volatile sub i64* %[[p_load]], i64 1 seq_cst +// CHECK: %[[atomic_sub:.*]] = atomicrmw sub i64* %[[p_load]], i64 1 seq_cst // CHECK: %[[res:.*]] = sub i64 %[[atomic_sub]], 1 // CHECK: ret i64 %[[res]] diff --git a/test/CodeGen/prefetchw-builtins.c b/test/CodeGen/prefetchw-builtins.c index 8a50325ea18f..53416de46ff4 100644 --- a/test/CodeGen/prefetchw-builtins.c +++ b/test/CodeGen/prefetchw-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +prfchw -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +prfchw -emit-llvm -o - %s | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/rd-builtins.c b/test/CodeGen/rd-builtins.c index 5cad90390941..0d7cd04d6f26 100644 --- a/test/CodeGen/rd-builtins.c +++ b/test/CodeGen/rd-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/rdrand-builtins.c b/test/CodeGen/rdrand-builtins.c index 15414a334580..936502b77483 100644 --- a/test/CodeGen/rdrand-builtins.c +++ b/test/CodeGen/rdrand-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - %s | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/regcall.c b/test/CodeGen/regcall.c new file mode 100644 index 000000000000..350a82d33963 --- /dev/null +++ b/test/CodeGen/regcall.c @@ -0,0 +1,120 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=i386-pc-win32 | FileCheck %s --check-prefix=Win32 +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=Win64 +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=i386-pc-linux-gnu | FileCheck %s --check-prefix=Lin32 +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=Lin64 + +#include <xmmintrin.h> + +void __regcall v1(int a, int b) {} +// Win32: define x86_regcallcc void @__regcall3__v1(i32 inreg %a, i32 inreg %b) +// Win64: define x86_regcallcc void @__regcall3__v1(i32 %a, i32 %b) +// Lin32: define x86_regcallcc void @__regcall3__v1(i32 inreg %a, i32 inreg %b) +// Lin64: define x86_regcallcc void @__regcall3__v1(i32 %a, i32 %b) + +void __attribute__((regcall)) v1b(int a, int b) {} +// Win32: define x86_regcallcc void @__regcall3__v1b(i32 inreg %a, i32 inreg %b) +// Win64: define x86_regcallcc void @__regcall3__v1b(i32 %a, i32 %b) +// Lin32: define x86_regcallcc void @__regcall3__v1b(i32 inreg %a, i32 inreg %b) +// Lin64: define x86_regcallcc void @__regcall3__v1b(i32 %a, i32 %b) + +void __regcall v2(char a, char b) {} +// Win32: define x86_regcallcc void @__regcall3__v2(i8 inreg signext %a, i8 inreg signext %b) +// Win64: define x86_regcallcc void @__regcall3__v2(i8 %a, i8 %b) +// Lin32: define x86_regcallcc void @__regcall3__v2(i8 inreg signext %a, i8 inreg signext %b) +// Lin64: define x86_regcallcc void @__regcall3__v2(i8 signext %a, i8 signext %b) + +struct Small { int x; }; +void __regcall v3(int a, struct Small b, int c) {} +// Win32: define x86_regcallcc void @__regcall3__v3(i32 inreg %a, i32 %b.0, i32 inreg %c) +// Win64: define x86_regcallcc void @__regcall3__v3(i32 %a, i32 %b.coerce, i32 %c) +// Lin32: define x86_regcallcc void @__regcall3__v3(i32 inreg %a, i32 inreg, i32 %b.0, i32 inreg %c) +// Lin64: define x86_regcallcc void @__regcall3__v3(i32 %a, i32 %b.coerce, i32 %c) + +struct Large { int a[5]; }; +void __regcall v4(int a, struct Large b, int c) {} +// Win32: define x86_regcallcc void @__regcall3__v4(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c) +// Win64: define x86_regcallcc void @__regcall3__v4(i32 %a, %struct.Large* %b, i32 %c) +// Lin32: define x86_regcallcc void @__regcall3__v4(i32 inreg %a, %struct.Large* byval align 4 %b, i32 %c) +// Lin64: define x86_regcallcc void @__regcall3__v4(i32 %a, [5 x i32] %b.coerce, i32 %c) + +struct HFA2 { double x, y; }; +struct HFA4 { double w, x, y, z; }; +struct HFA5 { double v, w, x, y, z; }; + +void __regcall hfa1(int a, struct HFA4 b, int c) {} +// Win32: define x86_regcallcc void @__regcall3__hfa1(i32 inreg %a, double %b.0, double %b.1, 
double %b.2, double %b.3, i32 inreg %c) +// Win64: define x86_regcallcc void @__regcall3__hfa1(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c) +// Lin32: define x86_regcallcc void @__regcall3__hfa1(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c) +// Lin64: define x86_regcallcc void @__regcall3__hfa1(i32 %a, double %b.coerce0, double %b.coerce1, double %b.coerce2, double %b.coerce3, i32 %c) + +// HFAs that would require more than six total SSE registers are passed +// indirectly. Additional vector arguments can consume the rest of the SSE +// registers. +void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} +// Win32: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double* inreg) +// Win64: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double %c) +// Lin32: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double* inreg) +// Lin64: define x86_regcallcc void @__regcall3__hfa2(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %b.coerce0, double %b.coerce1, double %b.coerce2, double %b.coerce3, double %c) + +// Ensure that we pass builtin types directly while counting them against the +// SSE register usage. +void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {} +// Win32: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1) +// Win64: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1) +// Lin32: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1) +// Lin64: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.coerce0, double %f.coerce1) + +// Aggregates with more than four elements are not HFAs and are passed byval. +// Because they are not classified as homogeneous, they don't get special +// handling to ensure alignment. +void __regcall hfa4(struct HFA5 a) {} +// Win32: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* byval align 4) +// Win64: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* %a) +// Lin32: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* byval align 4 %a) +// Lin64: define x86_regcallcc void @__regcall3__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4) + +// Return HFAs of 4 or fewer elements in registers. 
+static struct HFA2 g_hfa2; +struct HFA2 __regcall hfa5(void) { return g_hfa2; } +// Win32: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5() +// Win64: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5() +// Lin32: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5() +// Lin64: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5() + +typedef float __attribute__((vector_size(16))) v4f32; +struct HVA2 { v4f32 x, y; }; +struct HVA4 { v4f32 w, x, y, z; }; + +void __regcall hva1(int a, struct HVA4 b, int c) {} +// Win32: define x86_regcallcc void @__regcall3__hva1(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c) +// Win64: define x86_regcallcc void @__regcall3__hva1(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c) +// Lin32: define x86_regcallcc void @__regcall3__hva1(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c) +// Lin64: define x86_regcallcc void @__regcall3__hva1(i32 %a, <4 x float> %b.coerce0, <4 x float> %b.coerce1, <4 x float> %b.coerce2, <4 x float> %b.coerce3, i32 %c) + +void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} +// Win32: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float>* inreg) +// Win64: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c) +// Lin32: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float>* inreg) +// Lin64: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.coerce0, <4 x float> %a.coerce1, <4 x float> %a.coerce2, <4 x float> %a.coerce3, <4 x float> %b.coerce0, <4 x float> %b.coerce1, <4 x float> %b.coerce2, <4 x float> %b.coerce3, <4 x float> %c) + +void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} +// Win32: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1) +// Win64: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1) +// Lin32: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1) +// Lin64: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.coerce0, <4 x float> %f.coerce1) + +typedef float __attribute__((ext_vector_type(3))) v3f32; +struct OddSizeHVA { v3f32 x, y; }; + +void __regcall odd_size_hva(struct OddSizeHVA a) {} +// Win32: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1) +// Win64: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1) +// Lin32: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1) +// Lin64: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.coerce0, <3 x float> %a.coerce1) + +struct HFA6 { __m128 f[4]; }; +struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct 
HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;} +// Win32: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, %struct.HFA6* inreg %c, %struct.HFA6* inreg %d) +// Win64: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3) +// Lin32: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, %struct.HFA6* inreg %c, %struct.HFA6* inreg %d) +// Lin64: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce) diff --git a/test/CodeGen/renderscript.c b/test/CodeGen/renderscript.c index d47750a7736e..5482d36f583d 100644 --- a/test/CodeGen/renderscript.c +++ b/test/CodeGen/renderscript.c @@ -23,3 +23,118 @@ _Static_assert(_Alignof(long) == LONG_WIDTH_AND_ALIGN, "sizeof long is wrong"); long test_long(long v) { return v + 1; } + +// ============================================================================= +// Test coercion of aggregate argument or return value into integer arrays +// ============================================================================= + +// ============================================================================= +// aggregate parameter <= 4 bytes: coerced to [a x iNN] for both 32-bit and +// 64-bit RenderScript +// ============================================================================== + +typedef struct {char c1, c2, c3; } sChar3; +typedef struct {short s; char c;} sShortChar; + +// CHECK-RS32: void @argChar3([3 x i8] %s.coerce) +// CHECK-RS64: void @argChar3([3 x i8] %s.coerce) +void argChar3(sChar3 s) {} + +// CHECK-RS32: void @argShortChar([2 x i16] %s.coerce) +// CHECK-RS64: void @argShortChar([2 x i16] %s.coerce) +void argShortChar(sShortChar s) {} + +// ============================================================================= +// aggregate return value <= 4 bytes: coerced to [a x iNN] for both 32-bit and +// 64-bit RenderScript +// ============================================================================= + +// CHECK-RS32: [3 x i8] @retChar3() +// CHECK-RS64: [3 x i8] @retChar3() +sChar3 retChar3() { sChar3 r; return r; } + +// CHECK-RS32: [2 x i16] @retShortChar() +// CHECK-RS64: [2 x i16] @retShortChar() +sShortChar retShortChar() { sShortChar r; return r; } + +// ============================================================================= +// aggregate parameter <= 16 bytes: coerced to [a x iNN] for both 32-bit and +// 64-bit RenderScript +// ============================================================================= + +typedef struct {short s1; char c; short s2; } sShortCharShort; +typedef struct {int i; short s; char c; } sIntShortChar; +typedef struct {long l; int i; } sLongInt; + +// CHECK-RS32: void @argShortCharShort([3 x i16] %s.coerce) +// CHECK-RS64: void @argShortCharShort([3 x i16] %s.coerce) +void argShortCharShort(sShortCharShort s) {} + +// CHECK-RS32: void @argIntShortChar([2 x i32] %s.coerce) +// 
CHECK-RS64: void @argIntShortChar([2 x i32] %s.coerce) +void argIntShortChar(sIntShortChar s) {} + +// CHECK-RS32: void @argLongInt([2 x i64] %s.coerce) +// CHECK-RS64: void @argLongInt([2 x i64] %s.coerce) +void argLongInt(sLongInt s) {} + +// ============================================================================= +// aggregate return value <= 16 bytes: returned on stack for 32-bit RenderScript +// and coerced to [a x iNN] for 64-bit RenderScript +// ============================================================================= + +// CHECK-RS32: void @retShortCharShort(%struct.sShortCharShort* noalias sret %agg.result) +// CHECK-RS64: [3 x i16] @retShortCharShort() +sShortCharShort retShortCharShort() { sShortCharShort r; return r; } + +// CHECK-RS32: void @retIntShortChar(%struct.sIntShortChar* noalias sret %agg.result) +// CHECK-RS64: [2 x i32] @retIntShortChar() +sIntShortChar retIntShortChar() { sIntShortChar r; return r; } + +// CHECK-RS32: void @retLongInt(%struct.sLongInt* noalias sret %agg.result) +// CHECK-RS64: [2 x i64] @retLongInt() +sLongInt retLongInt() { sLongInt r; return r; } + +// ============================================================================= +// aggregate parameter <= 64 bytes: coerced to [a x iNN] for 32-bit RenderScript +// and passed on the stack for 64-bit RenderScript +// ============================================================================= + +typedef struct {int i1, i2, i3, i4, i5; } sInt5; +typedef struct {long l1, l2; char c; } sLong2Char; + +// CHECK-RS32: void @argInt5([5 x i32] %s.coerce) +// CHECK-RS64: void @argInt5(%struct.sInt5* %s) +void argInt5(sInt5 s) {} + +// CHECK-RS32: void @argLong2Char([3 x i64] %s.coerce) +// CHECK-RS64: void @argLong2Char(%struct.sLong2Char* %s) +void argLong2Char(sLong2Char s) {} + +// ============================================================================= +// aggregate return value <= 64 bytes: returned on stack for both 32-bit and +// 64-bit RenderScript +// ============================================================================= + +// CHECK-RS32: void @retInt5(%struct.sInt5* noalias sret %agg.result) +// CHECK-RS64: void @retInt5(%struct.sInt5* noalias sret %agg.result) +sInt5 retInt5() { sInt5 r; return r;} + +// CHECK-RS32: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result) +// CHECK-RS64: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result) +sLong2Char retLong2Char() { sLong2Char r; return r;} + +// ============================================================================= +// aggregate parameters and return values > 64 bytes: passed and returned on the +// stack for both 32-bit and 64-bit RenderScript +// ============================================================================= + +typedef struct {long l1, l2, l3, l4, l5, l6, l7, l8, l9; } sLong9; + +// CHECK-RS32: void @argLong9(%struct.sLong9* byval align 8 %s) +// CHECK-RS64: void @argLong9(%struct.sLong9* %s) +void argLong9(sLong9 s) {} + +// CHECK-RS32: void @retLong9(%struct.sLong9* noalias sret %agg.result) +// CHECK-RS64: void @retLong9(%struct.sLong9* noalias sret %agg.result) +sLong9 retLong9() { sLong9 r; return r; } diff --git a/test/CodeGen/rtm-builtins.c b/test/CodeGen/rtm-builtins.c index 5cf3237d83e8..44952a07cd77 100644 --- a/test/CodeGen/rtm-builtins.c +++ b/test/CodeGen/rtm-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +rtm -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature 
+rtm -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/sanitize-init-order.cpp b/test/CodeGen/sanitize-init-order.cpp index ee53f7b0771e..894f75e96739 100644 --- a/test/CodeGen/sanitize-init-order.cpp +++ b/test/CodeGen/sanitize-init-order.cpp @@ -1,12 +1,11 @@ // RUN: %clang_cc1 -fsanitize=address -emit-llvm -o - %s | FileCheck %s // Test blacklist functionality. -// RUN: echo "src:%s=init" > %t-file.blacklist +// RUN: echo "src:%s=init" | sed -e 's/\\/\\\\/g' > %t-file.blacklist // RUN: echo "type:PODWithCtorAndDtor=init" > %t-type.blacklist // RUN: echo "type:NS::PODWithCtor=init" >> %t-type.blacklist // RUN: %clang_cc1 -fsanitize=address -fsanitize-blacklist=%t-file.blacklist -emit-llvm -o - %s | FileCheck %s --check-prefix=BLACKLIST // RUN: %clang_cc1 -fsanitize=address -fsanitize-blacklist=%t-type.blacklist -emit-llvm -o - %s | FileCheck %s --check-prefix=BLACKLIST -// REQUIRES: shell struct PODStruct { int x; diff --git a/test/CodeGen/sanitize-thread-attr.cpp b/test/CodeGen/sanitize-thread-attr.cpp index 46cab4dbf92f..4e0e28e5e304 100644 --- a/test/CodeGen/sanitize-thread-attr.cpp +++ b/test/CodeGen/sanitize-thread-attr.cpp @@ -1,10 +1,8 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck -check-prefix=WITHOUT %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=thread | FileCheck -check-prefix=TSAN %s -// RUN: echo "src:%s" > %t +// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=thread -fsanitize-blacklist=%t | FileCheck -check-prefix=BL %s -// REQUIRES: shell - // The sanitize_thread attribute should be attached to functions // when ThreadSanitizer is enabled, unless no_sanitize_thread attribute // is present. 
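// (Illustrative sketch, not part of this patch: the two cases the comment
// above describes, with invented function names.
//   __attribute__((no_sanitize_thread)) void skipped(void) {}
//   void instrumented(void) {}
// Under -fsanitize=thread, `instrumented` gets the sanitize_thread IR
// attribute; `skipped` must not.)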
@@ -56,9 +54,9 @@ int global2 = *(int*)((char*)&global1+1); // BL: @__cxx_global_var_init{{.*}}[[NOATTR:#[0-9]+]] // TSAN: @__cxx_global_var_init{{.*}}[[WITH:#[0-9]+]] -// WITHOUT: attributes [[NOATTR]] = { nounwind{{.*}} } +// WITHOUT: attributes [[NOATTR]] = { noinline nounwind{{.*}} } -// BL: attributes [[NOATTR]] = { nounwind{{.*}} } +// BL: attributes [[NOATTR]] = { noinline nounwind{{.*}} } -// TSAN: attributes [[NOATTR]] = { nounwind{{.*}} } -// TSAN: attributes [[WITH]] = { nounwind sanitize_thread{{.*}} } +// TSAN: attributes [[NOATTR]] = { noinline nounwind{{.*}} } +// TSAN: attributes [[WITH]] = { noinline nounwind sanitize_thread{{.*}} } diff --git a/test/CodeGen/sanitize-thread-no-checking-at-run-time.m b/test/CodeGen/sanitize-thread-no-checking-at-run-time.m new file mode 100644 index 000000000000..098b7cf72ffd --- /dev/null +++ b/test/CodeGen/sanitize-thread-no-checking-at-run-time.m @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -x objective-c++ -emit-llvm -o - %s | FileCheck -check-prefix=WITHOUT %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -x objective-c++ -emit-llvm -o - %s -fsanitize=thread | FileCheck -check-prefix=TSAN %s + +__attribute__((objc_root_class)) +@interface NSObject +- (void)dealloc; +@end + +class NeedCleanup { +public: + ~NeedCleanup() __attribute__((no_sanitize("thread"))) {} +}; + +@interface MyObject : NSObject { + NeedCleanup v; +}; ++ (void) initialize; +- (void) dealloc; +@end + +@implementation MyObject ++ (void)initialize { +} +- (void)dealloc { + [super dealloc]; +} +@end + +// WITHOUT-NOT: "sanitize_thread_no_checking_at_run_time" + +// TSAN: initialize{{.*}}) [[ATTR:#[0-9]+]] +// TSAN: dealloc{{.*}}) [[ATTR:#[0-9]+]] +// TSAN: cxx_destruct{{.*}}) [[ATTR:#[0-9]+]] +// TSAN: attributes [[ATTR]] = { noinline nounwind {{.*}} "sanitize_thread_no_checking_at_run_time" {{.*}} } diff --git a/test/CodeGen/sha-builtins.c b/test/CodeGen/sha-builtins.c index 9c14a1ea7f0a..ede1a6bf7b1f 100644 --- a/test/CodeGen/sha-builtins.c +++ b/test/CodeGen/sha-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +sha -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +sha -emit-llvm -o - | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <immintrin.h> diff --git a/test/CodeGen/split-debug-filename.c b/test/CodeGen/split-debug-filename.c index 63970a83df10..43daeada96ab 100644 --- a/test/CodeGen/split-debug-filename.c +++ b/test/CodeGen/split-debug-filename.c @@ -1,7 +1,7 @@ -// RUN: %clang -target x86_64-linux-gnu -gsplit-dwarf -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -debug-info-kind=limited -split-dwarf-file foo.dwo -S -emit-llvm -o - %s | FileCheck %s int main (void) { return 0; } // Testing to ensure that the dwo name gets output into the compile unit. 
-// CHECK: split-debug-filename.dwo +// CHECK: !DICompileUnit({{.*}}, splitDebugFilename: "foo.dwo" diff --git a/test/CodeGen/split-debug-inlining.c b/test/CodeGen/split-debug-inlining.c new file mode 100644 index 000000000000..4a306d43e952 --- /dev/null +++ b/test/CodeGen/split-debug-inlining.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -debug-info-kind=limited -fno-split-dwarf-inlining -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck --check-prefix=ABSENT %s +void f(void) {} +// Verify that disabling split debug inlining info is propagated to the debug +// info metadata. +// CHECK: !DICompileUnit({{.*}}, splitDebugInlining: false +// ABSENT-NOT: splitDebugInlining diff --git a/test/CodeGen/sse-builtins.c b/test/CodeGen/sse-builtins.c index 6f313b825c9a..27b016f66517 100644 --- a/test/CodeGen/sse-builtins.c +++ b/test/CodeGen/sse-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/sse.c b/test/CodeGen/sse.c index 1e8c5dbe5dec..1191f55f81e7 100644 --- a/test/CodeGen/sse.c +++ b/test/CodeGen/sse.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 -O3 -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding -O3 -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s // FIXME: This test currently depends on optimization - it should be rewritten to avoid it. -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <emmintrin.h> diff --git a/test/CodeGen/sse2-builtins.c b/test/CodeGen/sse2-builtins.c index ee9dca7d4451..48c703685479 100644 --- a/test/CodeGen/sse2-builtins.c +++ b/test/CodeGen/sse2-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> @@ -73,7 +71,7 @@ __m128i test_mm_adds_epu16(__m128i A, __m128i B) { __m128d test_mm_and_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_and_pd - // CHECK: and <4 x i32> + // CHECK: and <2 x i64> return _mm_and_pd(A, B); } @@ -85,8 +83,8 @@ __m128i test_mm_and_si128(__m128i A, __m128i B) { __m128d test_mm_andnot_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_andnot_pd - // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1> - // CHECK: and <4 x i32> + // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1> + // CHECK: and <2 x i64> return _mm_andnot_pd(A, B); } @@ -845,7 +843,7 @@ __m128i test_mm_mullo_epi16(__m128i A, __m128i B) { __m128d test_mm_or_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_or_pd - // CHECK: or <4 x i32> %{{.*}}, %{{.*}} + // CHECK: or <2 x i64> %{{.*}}, %{{.*}} return _mm_or_pd(A, B); } @@ -1529,7 +1527,7 @@ __m128d test_mm_unpacklo_pd(__m128d A, __m128d B) { __m128d test_mm_xor_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_xor_pd - // CHECK: xor <4 x i32> %{{.*}}, %{{.*}} + // CHECK: xor <2 x i64> %{{.*}}, %{{.*}} return _mm_xor_pd(A, B); } diff --git a/test/CodeGen/sse3-builtins.c b/test/CodeGen/sse3-builtins.c index b046c431fd50..46a7bbbb91e2 100644 --- a/test/CodeGen/sse3-builtins.c +++ b/test/CodeGen/sse3-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse3 -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse3 -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/sse41-builtins.c b/test/CodeGen/sse41-builtins.c index ad24ecd5ed1b..adf9609b68f9 100644 --- a/test/CodeGen/sse41-builtins.c +++ b/test/CodeGen/sse41-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/sse42-builtins.c b/test/CodeGen/sse42-builtins.c index 00f7ff93afe2..523db1a3416e 100644 --- a/test/CodeGen/sse42-builtins.c +++ b/test/CodeGen/sse42-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/sse4a-builtins.c b/test/CodeGen/sse4a-builtins.c index 0604423fe17e..3d36a7dfa007 100644 --- a/test/CodeGen/sse4a-builtins.c +++ b/test/CodeGen/sse4a-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4a -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4a -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/ssse3-builtins.c b/test/CodeGen/ssse3-builtins.c index 673387c066b2..b2279e277cd2 100644 --- a/test/CodeGen/ssse3-builtins.c +++ b/test/CodeGen/ssse3-builtins.c @@ -1,7 +1,5 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/systemz-abi-vector.c b/test/CodeGen/systemz-abi-vector.c index 1d1f302e6324..455e3bbcced2 100644 --- a/test/CodeGen/systemz-abi-vector.c +++ b/test/CodeGen/systemz-abi-vector.c @@ -4,6 +4,8 @@ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z13 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch11 \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // Vector types diff --git a/test/CodeGen/systemz-abi.c b/test/CodeGen/systemz-abi.c index 375d02a35889..17fdb9e322c8 100644 --- a/test/CodeGen/systemz-abi.c +++ b/test/CodeGen/systemz-abi.c @@ -4,6 +4,8 @@ // RUN: -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z13 \ // RUN: -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch11 \ +// RUN: -emit-llvm -o - %s | FileCheck %s // Scalar types diff --git a/test/CodeGen/target-data.c b/test/CodeGen/target-data.c index c12ad2c1c977..1e8ce6a2fd12 100644 --- a/test/CodeGen/target-data.c +++ b/test/CodeGen/target-data.c @@ -169,6 +169,8 @@ // RUN: %clang_cc1 -triple s390x-unknown -target-cpu z13 -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR +// RUN: %clang_cc1 -triple s390x-unknown -target-cpu arch11 -o - -emit-llvm %s | \ +// RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR // SYSTEMZ-VECTOR: target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" // RUN: %clang_cc1 -triple msp430-unknown -o - -emit-llvm %s | \ @@ -177,7 +179,11 @@ // RUN: %clang_cc1 -triple tce-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=TCE -// TCE: target datalayout = "E-p:32:32-i8:8:32-i16:16:32-i64:32-f64:32-v64:32-v128:32-a:0:32-n32" +// TCE: target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32-v128:32:32-v256:32:32-v512:32:32-v1024:32:32-a0:0:32-n32" + +// RUN: %clang_cc1 -triple tcele-unknown -o - -emit-llvm %s | \ +// RUN: FileCheck %s -check-prefix=TCELE +// TCELE: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32-v128:32:32-v256:32:32-v512:32:32-v1024:32:32-a0:0:32-n32" // RUN: %clang_cc1 -triple spir-unknown -o - -emit-llvm %s | \ // 
RUN: FileCheck %s -check-prefix=SPIR diff --git a/test/CodeGen/tbaa-class.cpp b/test/CodeGen/tbaa-class.cpp index 7172e05d9e34..9b614a19e8b0 100644 --- a/test/CodeGen/tbaa-class.cpp +++ b/test/CodeGen/tbaa-class.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH // Test TBAA metadata generated by front-end. typedef unsigned char uint8_t; diff --git a/test/CodeGen/tbaa-ms-abi.cpp b/test/CodeGen/tbaa-ms-abi.cpp index 878e1b610b59..51100b18c3d4 100644 --- a/test/CodeGen/tbaa-ms-abi.cpp +++ b/test/CodeGen/tbaa-ms-abi.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i686-pc-win32 -disable-llvm-optzns -emit-llvm -o - -O1 %s | FileCheck %s +// RUN: %clang_cc1 -triple i686-pc-win32 -disable-llvm-passes -emit-llvm -o - -O1 %s | FileCheck %s // // Test that TBAA works in the Microsoft C++ ABI. We used to error out while // attempting to mangle RTTI. diff --git a/test/CodeGen/tbaa.cpp b/test/CodeGen/tbaa.cpp index 432c41e10793..10e64875476c 100644 --- a/test/CodeGen/tbaa.cpp +++ b/test/CodeGen/tbaa.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA +// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA // Test TBAA metadata generated by front-end. // // NO-TBAA-NOT: !tbaa diff --git a/test/CodeGen/tbm-builtins.c b/test/CodeGen/tbm-builtins.c index c8a9382ed17d..8e031408a4b6 100644 --- a/test/CodeGen/tbm-builtins.c +++ b/test/CodeGen/tbm-builtins.c @@ -1,10 +1,8 @@ -// RUN: %clang_cc1 %s -O3 -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s // FIXME: The code generation checks for add/sub and/or are depending on the optimizer. // The REQUIRES keyword will be removed when the FIXME is complete. // REQUIRES: x86-registered-target -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/temporary-lifetime-exceptions.cpp b/test/CodeGen/temporary-lifetime-exceptions.cpp index 17e21683f22c..f435560c9759 100644 --- a/test/CodeGen/temporary-lifetime-exceptions.cpp +++ b/test/CodeGen/temporary-lifetime-exceptions.cpp @@ -9,16 +9,16 @@ A Baz(const A&); void Test1() { // CHECK-LABEL: @_Z5Test1v( // CHECK: getelementptr - // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP:[^ ]+]]) + // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* nonnull [[TMP:[^ ]+]]) // CHECK-NEXT: getelementptr - // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP1:[^ ]+]]) + // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* nonnull [[TMP1:[^ ]+]]) // Normal exit - // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]]) - // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]]) + // CHECK: call void @llvm.lifetime.end(i64 1, i8* nonnull [[TMP1]]) + // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* nonnull [[TMP]]) // Exception exit - // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]]) - // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]]) + // CHECK: call void @llvm.lifetime.end(i64 1, i8* nonnull [[TMP1]]) + // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* nonnull [[TMP]]) Baz(Baz(A())); } diff --git a/test/CodeGen/temporary-lifetime.cpp b/test/CodeGen/temporary-lifetime.cpp index f6dd3e0b2a53..f105a441a367 100644 --- a/test/CodeGen/temporary-lifetime.cpp +++ b/test/CodeGen/temporary-lifetime.cpp @@ -21,27 +21,27 @@ T Baz(); void Test1() { // CHECK-DTOR-LABEL: Test1 - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_ // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_ // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR]]) // CHECK-DTOR: } // CHECK-NO-DTOR-LABEL: Test1 - // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]]) // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_ - // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) - // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]]) // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_ - // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull 
%[[ADDR]]) // CHECK-NO-DTOR: } { const A &a = A{}; @@ -55,27 +55,27 @@ void Test1() { void Test2() { // CHECK-DTOR-LABEL: Test2 - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR1:[0-9]+]]) // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_ - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR2:[0-9]+]]) // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_ // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR2]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR2]]) // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR1]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR1]]) // CHECK-DTOR: } // CHECK-NO-DTOR-LABEL: Test2 - // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR1:[0-9]+]]) // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]]) // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_ - // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR2:[0-9]+]]) // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]]) // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_ - // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]]) - // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR2]]) + // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR1]]) // CHECK-NO-DTOR: } const A &a = A{}; Foo(a); @@ -135,16 +135,16 @@ int Test5() { void Test6() { // CHECK-DTOR-LABEL: Test6 - // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call i32 @_Z3BazIiET_v() // CHECK-DTOR: store // CHECK-DTOR: call void @_Z3FooIiEvOT_ - // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]]) - // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* nonnull %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call i32 @_Z3BazIiET_v() // CHECK-DTOR: store // CHECK-DTOR: call void @_Z3FooIiEvOT_ - // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* nonnull %[[ADDR]]) // CHECK-DTOR: } Foo(Baz<int>()); Foo(Baz<int>()); @@ -152,16 +152,16 @@ void Test6() { void Test7() { // CHECK-DTOR-LABEL: Test7 - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]]) // CHECK-DTOR: call void 
@_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) - // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* nonnull %[[ADDR:[0-9]+]]) // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]]) // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]]) // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]]) - // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]]) + // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* nonnull %[[ADDR]]) // CHECK-DTOR: } Foo(Baz<A>()); Foo(Baz<A>()); diff --git a/test/CodeGen/thinlto_backend.ll b/test/CodeGen/thinlto_backend.ll index 0fb2643e037d..89f4fc407fbc 100644 --- a/test/CodeGen/thinlto_backend.ll +++ b/test/CodeGen/thinlto_backend.ll @@ -9,8 +9,8 @@ ; CHECK-WARNING: error: invalid argument '-fthinlto-index={{.*}}' only allowed with '-x ir' ; Ensure we get expected error for missing index file -; RUN: %clang -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR -; CHECK-ERROR: Error loading index file 'bad.thinlto.bc' +; RUN: %clang -O2 -o %t4.o -x ir %t1.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR1 +; CHECK-ERROR1: Error loading index file 'bad.thinlto.bc' ; Ensure f2 was imported ; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc @@ -18,6 +18,11 @@ ; CHECK-OBJ: T f1 ; CHECK-OBJ-NOT: U f2 +; Ensure we get expected error for input files without summaries +; RUN: opt -o %t2.o %s +; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR2 +; CHECK-ERROR2: Error loading imported file '{{.*}}': Could not find module summary + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/CodeGen/ubsan-blacklist.c b/test/CodeGen/ubsan-blacklist.c index f6e38822bff8..666003bd9233 100644 --- a/test/CodeGen/ubsan-blacklist.c +++ b/test/CodeGen/ubsan-blacklist.c @@ -1,13 +1,10 @@ // Verify ubsan doesn't emit checks for blacklisted functions and files // RUN: echo "fun:hash" > %t-func.blacklist -// RUN: echo "src:%s" > %t-file.blacklist +// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t-file.blacklist // RUN: %clang_cc1 -fsanitize=unsigned-integer-overflow -emit-llvm %s -o - | FileCheck %s --check-prefix=DEFAULT // RUN: %clang_cc1 -fsanitize=unsigned-integer-overflow -fsanitize-blacklist=%t-func.blacklist -emit-llvm %s -o - | FileCheck %s --check-prefix=FUNC // RUN: %clang_cc1 -fsanitize=unsigned-integer-overflow -fsanitize-blacklist=%t-file.blacklist -emit-llvm %s -o - | FileCheck %s --check-prefix=FILE -// FIXME: %t-file.blacklist contains DOSish paths. 
-// REQUIRES: shell - unsigned i; // DEFAULT: @hash diff --git a/test/CodeGen/unwind-attr.c b/test/CodeGen/unwind-attr.c index 527237578ee4..2065653e0de4 100644 --- a/test/CodeGen/unwind-attr.c +++ b/test/CodeGen/unwind-attr.c @@ -23,7 +23,7 @@ __attribute__((weak)) int test2(void) { return 0; } -// CHECK: attributes [[TF]] = { "{{.*}} } -// CHECK: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK: attributes [[TF]] = { noinline "{{.*}} } +// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} } -// CHECK-NOEXC: attributes [[NUW]] = { nounwind{{.*}} } +// CHECK-NOEXC: attributes [[NUW]] = { noinline nounwind{{.*}} } diff --git a/test/CodeGen/vecshift.c b/test/CodeGen/vecshift.c new file mode 100644 index 000000000000..1fa047cd30b9 --- /dev/null +++ b/test/CodeGen/vecshift.c @@ -0,0 +1,162 @@ +// RUN: %clang_cc1 -Wno-error-vec-elem-size -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-error-vec-elem-size -DEXT -emit-llvm %s -o - | FileCheck %s + +#ifdef EXT +typedef __attribute__((__ext_vector_type__(8))) char vector_char8; +typedef __attribute__((__ext_vector_type__(8))) short vector_short8; +typedef __attribute__((__ext_vector_type__(8))) int vector_int8; +typedef __attribute__((__ext_vector_type__(8))) unsigned char vector_uchar8; +typedef __attribute__((__ext_vector_type__(8))) unsigned short vector_ushort8; +typedef __attribute__((__ext_vector_type__(8))) unsigned int vector_uint8; +typedef __attribute__((__ext_vector_type__(4))) char vector_char4; +typedef __attribute__((__ext_vector_type__(4))) short vector_short4; +typedef __attribute__((__ext_vector_type__(4))) int vector_int4; +typedef __attribute__((__ext_vector_type__(4))) unsigned char vector_uchar4; +typedef __attribute__((__ext_vector_type__(4))) unsigned short vector_ushort4; +typedef __attribute__((__ext_vector_type__(4))) unsigned int vector_uint4; +#else +typedef __attribute__((vector_size(8))) char vector_char8; +typedef __attribute__((vector_size(16))) short vector_short8; +typedef __attribute__((vector_size(32))) int vector_int8; +typedef __attribute__((vector_size(8))) unsigned char vector_uchar8; +typedef __attribute__((vector_size(16))) unsigned short vector_ushort8; +typedef __attribute__((vector_size(32))) unsigned int vector_uint8; +typedef __attribute__((vector_size(4))) char vector_char4; +typedef __attribute__((vector_size(4))) short vector_short4; +typedef __attribute__((vector_size(16))) int vector_int4; +typedef __attribute__((vector_size(4))) unsigned char vector_uchar4; +typedef __attribute__((vector_size(8))) unsigned short vector_ushort4; +typedef __attribute__((vector_size(16))) unsigned int vector_uint4; +#endif + +char c; +short s; +int i; +unsigned char uc; +unsigned short us; +unsigned int ui; +vector_char8 vc8; +vector_short8 vs8; +vector_int8 vi8; +vector_uchar8 vuc8; +vector_ushort8 vus8; +vector_uint8 vui8; +vector_char4 vc4; +vector_short4 vs4; +vector_int4 vi4; +vector_uchar4 vuc4; +vector_ushort4 vus4; +vector_uint4 vui4; + +void foo() { + vc8 = 1 << vc8; +// CHECK: [[t0:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: shl <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[t0]] + vuc8 = 1 << vuc8; +// CHECK: [[t1:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: shl <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[t1]] + vi8 = 1 << vi8; +// CHECK: [[t2:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[t2]] + vui8 = 1 << vui8; +// CHECK: [[t3:%.+]] = load <8 x i32>, <8 x 
i32>* {{@.+}}, +// CHECK: shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[t3]] + vs8 = 1 << vs8; +// CHECK: [[t4:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, [[t4]] + vus8 = 1 << vus8; +// CHECK: [[t5:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, [[t5]] + + vc8 = c << vc8; +// CHECK: [[t6:%.+]] = load i8, i8* @c, +// CHECK: [[splat_splatinsert:%.+]] = insertelement <8 x i8> undef, i8 [[t6]], i32 0 +// CHECK: [[splat_splat:%.+]] = shufflevector <8 x i8> [[splat_splatinsert]], <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK: [[t7:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: shl <8 x i8> [[splat_splat]], [[t7]] + vuc8 = i << vuc8; +// CHECK: [[t8:%.+]] = load i32, i32* @i, +// CHECK: [[tconv:%.+]] = trunc i32 [[t8]] to i8 +// CHECK: [[splat_splatinsert7:%.+]] = insertelement <8 x i8> undef, i8 [[tconv]], i32 0 +// CHECK: [[splat_splat8:%.+]] = shufflevector <8 x i8> [[splat_splatinsert7]], <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK: [[t9:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: shl <8 x i8> [[splat_splat8]], [[t9]] + vi8 = uc << vi8; +// CHECK: [[t10:%.+]] = load i8, i8* @uc, +// CHECK: [[conv10:%.+]] = zext i8 [[t10]] to i32 +// CHECK: [[splat_splatinsert11:%.+]] = insertelement <8 x i32> undef, i32 [[conv10]], i32 0 +// CHECK: [[splat_splat12:%.+]] = shufflevector <8 x i32> [[splat_splatinsert11]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t11:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: shl <8 x i32> [[splat_splat12]], [[t11]] + vui8 = us << vui8; +// CHECK: [[t12:%.+]] = load i16, i16* @us, +// CHECK: [[conv14:%.+]] = zext i16 [[t12]] to i32 +// CHECK: [[splat_splatinsert15:%.+]] = insertelement <8 x i32> undef, i32 [[conv14]], i32 0 +// CHECK: [[splat_splat16:%.+]] = shufflevector <8 x i32> [[splat_splatinsert15]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t13:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: shl <8 x i32> [[splat_splat16]], [[t13]] + vs8 = ui << vs8; +// CHECK: [[t14:%.+]] = load i32, i32* @ui, +// CHECK: [[conv18:%.+]] = trunc i32 [[t14]] to i16 +// CHECK: [[splat_splatinsert19:%.+]] = insertelement <8 x i16> undef, i16 [[conv18]], i32 0 +// CHECK: [[splat_splat20:%.+]] = shufflevector <8 x i16> [[splat_splatinsert19]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK: [[t15:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: shl <8 x i16> [[splat_splat20]], [[t15]] + vus8 = 1 << vus8; +// CHECK: [[t16:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: [[shl22:%.+]] = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, [[t16]] + + vc8 = vc8 << vc8; +// CHECK: [[t17:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: [[t18:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: shl <8 x i8> [[t17]], [[t18]] + vi8 = vi8 << vuc8; +// CHECK: [[t19:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: [[t20:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: [[shprom:%.+]] = zext <8 x i8> [[t20]] to <8 x i32> +// CHECK: shl <8 x i32> [[t19]], [[shprom]] + vuc8 = vuc8 << vi8; +// CHECK: [[t21:%.+]] = load <8 x i8>, <8 x i8>* {{@.+}}, +// CHECK: [[t22:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: [[sh_prom25:%.+]] = trunc <8 x i32> [[t22]] to <8 x i8> +// CHECK: shl <8 x i8> [[t21]], [[sh_prom25]] + vus8 = vus8 << vui8; +// CHECK: [[t23:%.+]] = load <8 x i16>, <8 x i16>* 
{{@.+}}, +// CHECK: [[t24:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: [[sh_prom27:%.+]] = trunc <8 x i32> [[t24]] to <8 x i16> +// CHECK: shl <8 x i16> [[t23]], [[sh_prom27]] + vui8 = vui8 << vs8; +// CHECK: [[t25:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: [[t26:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: [[sh_prom29:%.+]] = zext <8 x i16> [[t26]] to <8 x i32> +// CHECK: shl <8 x i32> [[t25]], [[sh_prom29]] + + vui8 <<= s; +// CHECK: [[t27:%.+]] = load i16, i16* @s, +// CHECK: [[conv40:%.+]] = sext i16 [[t27]] to i32 +// CHECK: [[splat_splatinsert41:%.+]] = insertelement <8 x i32> undef, i32 [[conv40]], i32 0 +// CHECK: [[splat_splat42:%.+]] = shufflevector <8 x i32> [[splat_splatinsert41]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t28:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: shl <8 x i32> [[t28]], [[splat_splat42]] + vi8 <<= us; +// CHECK: [[t29:%.+]] = load i16, i16* @us, +// CHECK: [[conv44:%.+]] = zext i16 [[t29]] to i32 +// CHECK: [[splat_splatinsert45:%.+]] = insertelement <8 x i32> undef, i32 [[conv44]], i32 0 +// CHECK: [[splat_splat46:%.+]] = shufflevector <8 x i32> [[splat_splatinsert45]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t30:%.+]] = load <8 x i32>, <8 x i32>* {{@.+}}, +// CHECK: shl <8 x i32> [[t30]], [[splat_splat46]] + vus8 <<= i; +// CHECK: [[t31:%.+]] = load i32, i32* @i, +// CHECK: [[splat_splatinsert48:%.+]] = insertelement <8 x i32> undef, i32 [[t31]], i32 0 +// CHECK: [[splat_splat49:%.+]] = shufflevector <8 x i32> [[splat_splatinsert48]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t32:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: [[sh_prom50:%.+]] = trunc <8 x i32> [[splat_splat49]] to <8 x i16> +// CHECK: shl <8 x i16> [[t32]], [[sh_prom50]] + vs8 <<= ui; +// CHECK: [[t33:%.+]] = load i32, i32* @ui, +// CHECK: [[splat_splatinsert52:%.+]] = insertelement <8 x i32> undef, i32 [[t33]], i32 0 +// CHECK: [[splat_splat53:%.+]] = shufflevector <8 x i32> [[splat_splatinsert52]], <8 x i32> undef, <8 x i32> zeroinitializer +// CHECK: [[t34:%.+]] = load <8 x i16>, <8 x i16>* {{@.+}}, +// CHECK: [[sh_prom54:%.+]] = trunc <8 x i32> [[splat_splat53]] to <8 x i16> +// CHECK: shl <8 x i16> [[t34]], [[sh_prom54]] +} diff --git a/test/CodeGen/vector.c b/test/CodeGen/vector.c index 14f507972639..ebaea841aa85 100644 --- a/test/CodeGen/vector.c +++ b/test/CodeGen/vector.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i386-apple-darwin9 -O1 -target-cpu core2 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -ffreestanding -triple i386-apple-darwin9 -O1 -target-cpu core2 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s typedef short __v4hi __attribute__ ((__vector_size__ (8))); void test1() { @@ -20,8 +20,6 @@ void test3 ( vec4* a, char b, float c ) { -// Don't include mm_malloc.h, it's system specific. 
-#define __MM_MALLOC_H #include <mmintrin.h> diff --git a/test/CodeGen/windows-on-arm-itanium-thread-local.c b/test/CodeGen/windows-on-arm-itanium-thread-local.c new file mode 100644 index 000000000000..7f12c367d91b --- /dev/null +++ b/test/CodeGen/windows-on-arm-itanium-thread-local.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple thumbv7--windows-itanium -fdeclspec -fms-compatibility -fms-compatibility-version=19.0 -S -emit-llvm -o - %s | FileCheck %s + +__declspec(thread) static void *c; +void f(void *p) { + c = p; +} + +// CHECK-LABEL: @f(i8* %p) +// CHECK-NOT: call i8** @_ZTWL1c() +// CHECK: call arm_aapcs_vfpcc i8** @_ZTWL1c() + diff --git a/test/CodeGen/windows-on-arm-stack-probe-size.c b/test/CodeGen/windows-on-arm-stack-probe-size.c index c072566d7125..15bdacd6ee46 100644 --- a/test/CodeGen/windows-on-arm-stack-probe-size.c +++ b/test/CodeGen/windows-on-arm-stack-probe-size.c @@ -5,7 +5,7 @@ // RUN: | FileCheck %s -check-prefix CHECK-4096 // RUN: %clang_cc1 -triple thumbv7-windows-itanium -fms-extensions -O2 -emit-llvm %s -o - \ -// RUN: | FileCheck %s -check-prefix CHECK +// RUN: | FileCheck %s __declspec(dllimport) void initialise(signed char buffer[4096]); diff --git a/test/CodeGen/windows-swiftcall.c b/test/CodeGen/windows-swiftcall.c new file mode 100644 index 000000000000..f76b2fd3a40f --- /dev/null +++ b/test/CodeGen/windows-swiftcall.c @@ -0,0 +1,458 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-windows -emit-llvm -target-cpu core2 -o - %s | FileCheck %s + +#define SWIFTCALL __attribute__((swiftcall)) +#define OUT __attribute__((swift_indirect_result)) +#define ERROR __attribute__((swift_error_result)) +#define CONTEXT __attribute__((swift_context)) + +// CHECK: [[STRUCT2_RESULT:@.*]] = private {{.*}} constant [[STRUCT2_TYPE:%.*]] { i32 0, i8 0, i8 undef, i8 0, float 0.000000e+00, float 0.000000e+00 } + +/*****************************************************************************/ +/****************************** PARAMETER ABIS *******************************/ +/*****************************************************************************/ + +SWIFTCALL void indirect_result_1(OUT int *arg0, OUT float *arg1) {} +// CHECK-LABEL: define {{.*}} void @indirect_result_1(i32* noalias sret align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +// TODO: maybe this shouldn't suppress sret. 
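// (Illustrative note, not part of this test: at the C level the call is
// simply
//   int rc = indirect_result_2(&i, &f);
// and because the function returns an i32 directly, its first OUT
// parameter can no longer carry sret, leaving the plain noalias pointer
// checked below.)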
+SWIFTCALL int indirect_result_2(OUT int *arg0, OUT float *arg1) { __builtin_unreachable(); } +// CHECK-LABEL: define {{.*}} i32 @indirect_result_2(i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +typedef struct { char array[1024]; } struct_reallybig; +SWIFTCALL struct_reallybig indirect_result_3(OUT int *arg0, OUT float *arg1) { __builtin_unreachable(); } +// CHECK-LABEL: define {{.*}} void @indirect_result_3({{.*}}* noalias sret {{.*}}, i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}}) + +SWIFTCALL void context_1(CONTEXT void *self) {} +// CHECK-LABEL: define {{.*}} void @context_1(i8* swiftself + +SWIFTCALL void context_2(void *arg0, CONTEXT void *self) {} +// CHECK-LABEL: define {{.*}} void @context_2(i8*{{.*}}, i8* swiftself + +SWIFTCALL void context_error_1(CONTEXT int *self, ERROR float **error) {} +// CHECK-LABEL: define {{.*}} void @context_error_1(i32* swiftself{{.*}}, float** swifterror) +// CHECK: [[TEMP:%.*]] = alloca float*, align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[ERRORARG:%.*]], align 8 +// CHECK: store float* [[T0]], float** [[TEMP]], align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[TEMP]], align 8 +// CHECK: store float* [[T0]], float** [[ERRORARG]], align 8 +void test_context_error_1() { + int x; + float *error; + context_error_1(&x, &error); +} +// CHECK-LABEL: define void @test_context_error_1() +// CHECK: [[X:%.*]] = alloca i32, align 4 +// CHECK: [[ERROR:%.*]] = alloca float*, align 8 +// CHECK: [[TEMP:%.*]] = alloca swifterror float*, align 8 +// CHECK: [[T0:%.*]] = load float*, float** [[ERROR]], align 8 +// CHECK: store float* [[T0]], float** [[TEMP]], align 8 +// CHECK: call [[SWIFTCC:swiftcc]] void @context_error_1(i32* swiftself [[X]], float** swifterror [[TEMP]]) +// CHECK: [[T0:%.*]] = load float*, float** [[TEMP]], align 8 +// CHECK: store float* [[T0]], float** [[ERROR]], align 8 + +SWIFTCALL void context_error_2(short s, CONTEXT int *self, ERROR float **error) {} +// CHECK-LABEL: define {{.*}} void @context_error_2(i16{{.*}}, i32* swiftself{{.*}}, float** swifterror) + +/*****************************************************************************/ +/********************************** LOWERING *********************************/ +/*****************************************************************************/ + +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float8 __attribute__((ext_vector_type(8))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double4 __attribute__((ext_vector_type(4))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int int5 __attribute__((ext_vector_type(5))); +typedef int int8 __attribute__((ext_vector_type(8))); + +#define TEST(TYPE) \ + SWIFTCALL TYPE return_##TYPE(void) { \ + TYPE result = {}; \ + return result; \ + } \ + SWIFTCALL void take_##TYPE(TYPE v) { \ + } \ + void test_##TYPE() { \ + take_##TYPE(return_##TYPE()); \ + } + +/*****************************************************************************/ +/*********************************** STRUCTS *********************************/ +/*****************************************************************************/ + +typedef struct { +} struct_empty; +TEST(struct_empty); +// CHECK-LABEL: define {{.*}} @return_struct_empty() +// CHECK: ret void +// CHECK-LABEL: define {{.*}} @take_struct_empty() +// CHECK: ret void + +typedef struct { + 
int x; + char c0; + char c1; + float f0; + float f1; +} struct_1; +TEST(struct_1); +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_1() {{.*}}{ +// CHECK: [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[VAR:%.*]] = alloca [[STRUCT1]], align 4 +// CHECK: call void @llvm.memset +// CHECK: call void @llvm.memcpy +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = load i64, i64* [[GEP1]], align 4 +// CHECK: [[R0:%.*]] = insertvalue { i64, i64 } undef, i64 [[T0]], 0 +// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1 +// CHECK: ret { i64, i64 } [[R1]] +// CHECK: } +// CHECK-LABEL: define swiftcc void @take_struct_1(i64, i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[V]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: store i64 %1, i64* [[GEP1]], align 4 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_struct_1() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[STRUCT1:%.*]], align 4 +// CHECK: [[RET:%.*]] = call swiftcc { i64, i64 } @return_struct_1() +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[AGG]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[E0:%.*]] = extractvalue { i64, i64 } [[RET]], 0 +// CHECK: store i64 [[E0]], i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[E1:%.*]] = extractvalue { i64, i64 } [[RET]], 1 +// CHECK: store i64 [[E1]], i64* [[GEP1]], align 4 +// CHECK: [[CAST2:%.*]] = bitcast [[STRUCT1]]* [[AGG]] to { i64, i64 }* +// CHECK: [[GEP2:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST2]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[GEP2]], align 4 +// CHECK: [[GEP3:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST2]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[GEP3]], align 4 +// CHECK: call swiftcc void @take_struct_1(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +typedef struct { + int x; + char c0; + __attribute__((aligned(2))) char c1; + float f0; + float f1; +} struct_2; +TEST(struct_2); +// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_2() {{.*}}{ +// CHECK: [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]] +// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]] +// CHECK: [[CASTRET:%.*]] = bitcast {{.*}} [[RET]] +// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]] +// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]] +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* 
[[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = load i64, i64* [[GEP1]], align 4 +// CHECK: [[R0:%.*]] = insertvalue { i64, i64 } undef, i64 [[T0]], 0 +// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1 +// CHECK: ret { i64, i64 } [[R1]] +// CHECK: } +// CHECK-LABEL: define swiftcc void @take_struct_2(i64, i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64, i64 }* +// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP0]], align 4 +// CHECK: [[GEP1:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: store i64 %1, i64* [[GEP1]], align 4 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_struct_2() {{.*}} { +// CHECK: [[TMP:%.*]] = alloca [[STRUCT2_TYPE]], align 4 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_struct_2() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[STRUCT2_TYPE]]* [[TMP]] to { i64, i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds {{.*}} [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T0:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T0]], i64* [[GEP]], align 4 +// CHECK: [[GEP:%.*]] = getelementptr inbounds {{.*}} [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T0:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T0]], i64* [[GEP]], align 4 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[TMP]] to { i64, i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 4 +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[R1:%.*]] = load i64, i64* [[GEP]], align 4 +// CHECK: call swiftcc void @take_struct_2(i64 [[R0]], i64 [[R1]]) +// CHECK: ret void +// CHECK: } + +// There's no way to put a field randomly in the middle of an otherwise +// empty storage unit in C, so that case has to be tested in C++, which +// can use empty structs to introduce arbitrary padding. (In C, they end up +// with size 0 and so don't affect layout.) + +// Misaligned data rule. 
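// (A C++ illustration of the empty-storage-unit note above, not part of
// this test:
//   struct Empty {};
//   struct Padded { Empty e; char c; };  // c lands at offset 1, since
//                                        // sizeof(Empty) == 1 in C++
// In C the same Empty would have size 0 and leave the layout unchanged.)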
+typedef struct { + char c0; + __attribute__((packed)) float f; +} struct_misaligned_1; +TEST(struct_misaligned_1) +// CHECK-LABEL: define swiftcc i64 @return_struct_misaligned_1() +// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8* +// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5 +// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8* +// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1 +// CHECK: ret i64 [[R0]] +// CHECK:} +// CHECK-LABEL: define swiftcc void @take_struct_misaligned_1(i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP]], align 1 +// CHECK: ret void +// CHECK: } +// CHECK: define void @test_struct_misaligned_1() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[STRUCT:%.*]], align 1 +// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_struct_misaligned_1() +// CHECK: [[T0:%.*]] = bitcast [[STRUCT]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: store i64 [[CALL]], i64* [[T1]], align 1 +// CHECK: [[T0:%.*]] = bitcast [[STRUCT]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: [[P:%.*]] = load i64, i64* [[T1]], align 1 +// CHECK: call swiftcc void @take_struct_misaligned_1(i64 [[P]]) +// CHECK: ret void +// CHECK: } + +// Too many scalars. +typedef struct { + long long x[5]; +} struct_big_1; +TEST(struct_big_1) + +// CHECK-LABEL: define {{.*}} void @return_struct_big_1({{.*}} noalias sret + +// Should not be byval. 
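// (Illustrative contrast, not part of this test: a smaller aggregate such
// as
//   typedef struct { long long x[2]; } struct_small_2;
// stays within the expansion limit and would be passed as two i64s, like
// struct_1 earlier in this file; the five i64s of struct_big_1 exceed it,
// so the argument becomes an indirect pointer rather than byval:)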
+// CHECK-LABEL: define {{.*}} void @take_struct_big_1({{.*}}*{{( %.*)?}}) + +/*****************************************************************************/ +/********************************* TYPE MERGING ******************************/ +/*****************************************************************************/ + +typedef union { + float f; + double d; +} union_het_fp; +TEST(union_het_fp) +// CHECK-LABEL: define swiftcc i64 @return_union_het_fp() +// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]] +// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8* +// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8* +// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]] +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8 +// CHECK: ret i64 [[R0]] +// CHECK-LABEL: define swiftcc void @take_union_het_fp(i64) {{.*}}{ +// CHECK: [[V:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[V]] to { i64 }* +// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0 +// CHECK: store i64 %0, i64* [[GEP]], align 8 +// CHECK: ret void +// CHECK: } +// CHECK-LABEL: define void @test_union_het_fp() {{.*}}{ +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 8 +// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_union_het_fp() +// CHECK: [[T0:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: store i64 [[CALL]], i64* [[T1]], align 8 +// CHECK: [[T0:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64 }* +// CHECK: [[T1:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[T0]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T1]], align 8 +// CHECK: call swiftcc void @take_union_het_fp(i64 [[V0]]) +// CHECK: ret void +// CHECK: } + + +typedef union { + float f1; + float f2; +} union_hom_fp; +TEST(union_hom_fp) +// CHECK-LABEL: define void @test_union_hom_fp() +// CHECK: [[TMP:%.*]] = alloca [[REC:%.*]], align 4 +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] float @return_union_hom_fp() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ float }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store float [[CALL]], float* [[T0]], align 4 +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load float, float* [[T0]], align 4 +// CHECK: call [[SWIFTCC]] void @take_union_hom_fp(float [[FIRST]]) +// CHECK: ret void + +typedef union { + float f1; + float4 fv2; +} union_hom_fp_partial; +TEST(union_hom_fp_partial) +// CHECK: define void @test_union_hom_fp_partial() +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_hom_fp_partial() +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T1]], i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { 
i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T1]], i64* [[T0]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[T0]], align 8 +// CHECK: call swiftcc void @take_union_hom_fp_partial(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +typedef union { + struct { int x, y; } f1; + float4 fv2; +} union_het_fpv_partial; +TEST(union_het_fpv_partial) +// CHECK-LABEL: define void @test_union_het_fpv_partial() +// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16 +// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_het_fpv_partial() +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 0 +// CHECK: store i64 [[T1]], i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue { i64, i64 } [[CALL]], 1 +// CHECK: store i64 [[T1]], i64* [[T0]], align 8 +// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }* +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0 +// CHECK: [[V0:%.*]] = load i64, i64* [[T0]], align 16 +// CHECK: [[T0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 1 +// CHECK: [[V1:%.*]] = load i64, i64* [[T0]], align 8 +// CHECK: call swiftcc void @take_union_het_fpv_partial(i64 [[V0]], i64 [[V1]]) +// CHECK: ret void +// CHECK: } + +/*****************************************************************************/ +/****************************** VECTOR LEGALIZATION **************************/ +/*****************************************************************************/ + +TEST(int4) +// CHECK-LABEL: define {{.*}} <4 x i32> @return_int4() +// CHECK-LABEL: define {{.*}} @take_int4(<4 x i32> + +TEST(int8) +// CHECK-LABEL: define {{.*}} @return_int8() +// CHECK: [[RET:%.*]] = alloca [[REC:<8 x i32>]], align 32 +// CHECK: [[VAR:%.*]] = alloca [[REC]], align +// CHECK: store +// CHECK: load +// CHECK: store +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, <4 x i32> }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, <4 x i32> }]] undef, <4 x i32> [[FIRST]], 0 +// CHECK: [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], <4 x i32> [[SECOND]], 1 +// CHECK: ret [[UAGG]] [[T1]] +// CHECK-LABEL: define {{.*}} @take_int8(<4 x i32>, <4 x i32>) +// CHECK: [[V:%.*]] = alloca [[REC]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store <4 x i32> %0, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], 
i32 0, i32 1 +// CHECK: store <4 x i32> %1, <4 x i32>* [[T0]], align +// CHECK: ret void +// CHECK-LABEL: define void @test_int8() +// CHECK: [[TMP1:%.*]] = alloca [[REC]], align +// CHECK: [[TMP2:%.*]] = alloca [[REC]], align +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int8() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align +// CHECK: store [[REC]] [[V]], [[REC]]* [[TMP2]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: call [[SWIFTCC]] void @take_int8(<4 x i32> [[FIRST]], <4 x i32> [[SECOND]]) +// CHECK: ret void + +TEST(int5) +// CHECK-LABEL: define {{.*}} @return_int5() +// CHECK: [[RET:%.*]] = alloca [[REC:<5 x i32>]], align 32 +// CHECK: [[VAR:%.*]] = alloca [[REC]], align +// CHECK: store +// CHECK: load +// CHECK: store +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, i32 }]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align +// CHECK: [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, i32 }]] undef, <4 x i32> [[FIRST]], 0 +// CHECK: [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1 +// CHECK: ret [[UAGG]] [[T1]] +// CHECK-LABEL: define {{.*}} @take_int5(<4 x i32>, i32) +// CHECK: [[V:%.*]] = alloca [[REC]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: store <4 x i32> %0, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: store i32 %1, i32* [[T0]], align +// CHECK: ret void +// CHECK-LABEL: define void @test_int5() +// CHECK: [[TMP1:%.*]] = alloca [[REC]], align +// CHECK: [[TMP2:%.*]] = alloca [[REC]], align +// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int5() +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]* +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0 +// CHECK: store <4 x i32> [[T1]], <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1 +// CHECK: store i32 [[T1]], i32* [[T0]], align +// CHECK: [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align +// CHECK: store [[REC]] [[V]], [[REC]]* [[TMP2]], align +// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]* +// 
CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0 +// CHECK: [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align +// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1 +// CHECK: [[SECOND:%.*]] = load i32, i32* [[T0]], align +// CHECK: call [[SWIFTCC]] void @take_int5(<4 x i32> [[FIRST]], i32 [[SECOND]]) +// CHECK: ret void + +typedef struct { + int x; + int3 v __attribute__((packed)); +} misaligned_int3; +TEST(misaligned_int3) +// CHECK-LABEL: define swiftcc void @take_misaligned_int3(i64, i64) diff --git a/test/CodeGen/x86-inline-asm-v-constraint.c b/test/CodeGen/x86-inline-asm-v-constraint.c new file mode 100644 index 000000000000..d335e4b6a0f7 --- /dev/null +++ b/test/CodeGen/x86-inline-asm-v-constraint.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu x86-64 -o - |opt -instnamer -S |FileCheck %s --check-prefix SSE +// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake -D AVX -o -|opt -instnamer -S | FileCheck %s --check-prefixes AVX,SSE +// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake-avx512 -D AVX512 -D AVX -o -|opt -instnamer -S | FileCheck %s --check-prefixes AVX512,AVX,SSE +// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu knl -D AVX -D AVX512 -o - |opt -instnamer -S | FileCheck %s --check-prefixes AVX512,AVX,SSE + +typedef float __m128 __attribute__ ((vector_size (16))); +typedef float __m256 __attribute__ ((vector_size (32))); +typedef float __m512 __attribute__ ((vector_size (64))); + +// SSE: call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %tmp, <4 x float> %tmp1) +__m128 testXMM(__m128 _xmm0, long _l) { + __asm__("vmovhlps %1, %2, %0" :"=v"(_xmm0) : "v"(_l), "v"(_xmm0)); + return _xmm0; +} + +// AVX: call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %tmp) +__m256 testYMM(__m256 _ymm0) { +#ifdef AVX + __asm__("vmovsldup %1, %0" :"=v"(_ymm0) : "v"(_ymm0)); +#endif + return _ymm0; +} + +// AVX512: call <16 x float> asm "vpternlogd $$0, $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %tmp, <16 x float> %tmp1) +__m512 testZMM(__m512 _zmm0, __m512 _zmm1) { +#ifdef AVX512 + __asm__("vpternlogd $0, %1, %2, %0" :"=v"(_zmm0) : "v"(_zmm1), "v"(_zmm0)); +#endif + return _zmm0; +} diff --git a/test/CodeGen/x86_32-xsave.c b/test/CodeGen/x86_32-xsave.c index da5d38a49f1b..b475d63fb357 100644 --- a/test/CodeGen/x86_32-xsave.c +++ b/test/CodeGen/x86_32-xsave.c @@ -1,18 +1,18 @@ -// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE -// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE +// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE +// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE -// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT -// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown 
-target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT +// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT +// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT -// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC -// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC +// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC +// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC -// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES -// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES +// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES +// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES void test() { - unsigned long long tmp_ULLi; - void* tmp_vp; + unsigned long long tmp_ULLi = 0; + void* tmp_vp = 0; #ifdef TEST_XSAVE // XSAVE: [[tmp_vp_1:%[0-9a-zA-z]+]] = load i8*, i8** %tmp_vp, align 4 diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c index a2d60cc6b1ab..9f375d780c94 100644 --- a/test/CodeGen/x86_64-arguments.c +++ b/test/CodeGen/x86_64-arguments.c @@ -470,13 +470,14 @@ typedef struct { s512 x55; __m512 x56; -// Even on AVX512, aggregates of size larger than four eightbytes have class -// MEMORY (AVX512 draft 0.3 3.2.3p2 Rule 1). +// On AVX512, aggregates which contain a __m512 type are classified as SSE/SSEUP +// as per https://github.com/hjl-tools/x86-psABI/commit/30f9c9 3.2.3p2 Rule 1 // -// CHECK: declare void @f55(%struct.s512* byval align 64) +// AVX512: declare void @f55(<16 x float>) +// NO-AVX512: declare void @f55(%struct.s512* byval align 64) void f55(s512 x); -// However, __m512 has type SSE/SSEUP on AVX512. +// __m512 has type SSE/SSEUP on AVX512. 
// // AVX512: declare void @f56(<16 x float>) // NO-AVX512: declare void @f56(<16 x float>* byval align 64) @@ -535,3 +536,12 @@ void f64() { f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0i); f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0i); } + +struct t65 { + __m256 m; + int : 0; +}; +// SSE-LABEL: @f65(%struct.t65* byval align 32 %{{[^,)]+}}) +// AVX: @f65(<8 x float> %{{[^,)]+}}) +void f65(struct t65 a0) { +} diff --git a/test/CodeGen/x86_64-xsave.c b/test/CodeGen/x86_64-xsave.c index ecdb7252e362..496e982b99c5 100644 --- a/test/CodeGen/x86_64-xsave.c +++ b/test/CodeGen/x86_64-xsave.c @@ -1,18 +1,18 @@ -// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE -// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE +// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE +// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE -// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT -// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT +// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT +// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT -// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC -// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC +// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC +// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC -// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES -// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES +// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES +// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES void test() { - unsigned long long tmp_ULLi; - void* tmp_vp; + unsigned long long tmp_ULLi = 0; + void* tmp_vp = 0; #ifdef TEST_XSAVE // XSAVE: [[tmp_vp_1:%[0-9a-zA-z]+]] = load i8*, i8** %tmp_vp, align 8 diff --git a/test/CodeGen/x86_inlineasm_curly_bracket_escape.c b/test/CodeGen/x86_inlineasm_curly_bracket_escape.c new file mode 100644 index 000000000000..503c13e61379 --- /dev/null +++ b/test/CodeGen/x86_inlineasm_curly_bracket_escape.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -O0 -S -emit-llvm -o - -Wall -Werror | FileCheck %s +// This test checks validity of inline assembly using curly brackets syntax +// for extended inline asm. + +void test_curly_brackets() { + //CHECK: %xmm1,%xmm0,%xmm1 {%k1}{z} + asm("vpaddb\t %%xmm1,%%xmm0,%%xmm1 %{%%k1%}%{z%}\t":::); +}
\ No newline at end of file diff --git a/test/CodeGen/xop-builtins.c b/test/CodeGen/xop-builtins.c index 3b1d1ef39cfc..da9a3b925de2 100644 --- a/test/CodeGen/xop-builtins.c +++ b/test/CodeGen/xop-builtins.c @@ -1,8 +1,6 @@ -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Werror | FileCheck %s -// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -// Don't include mm_malloc.h, it's system specific. -#define __MM_MALLOC_H #include <x86intrin.h> diff --git a/test/CodeGen/xray-attributes-supported-arm.cpp b/test/CodeGen/xray-attributes-supported-arm.cpp new file mode 100644 index 000000000000..3104f285bfb3 --- /dev/null +++ b/test/CodeGen/xray-attributes-supported-arm.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 %s -fxray-instrument -std=c++11 -x c++ -emit-llvm -o - -triple arm-unknown-linux-gnu | FileCheck %s + +// Make sure that the LLVM attributes for XRay-annotated functions do show up. +[[clang::xray_always_instrument]] void foo() { +// CHECK: define void @_Z3foov() #0 +}; + +[[clang::xray_never_instrument]] void bar() { +// CHECK: define void @_Z3barv() #1 +}; + +// CHECK: #0 = {{.*}}"function-instrument"="xray-always" +// CHECK: #1 = {{.*}}"function-instrument"="xray-never"
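Note on the swiftcall checks above: the return_*, take_*, and test_* functions referenced by the CHECK-LABEL lines are generated by a TEST(TYPE) macro defined near the top of test/CodeGen/64bit-swiftcall.c, outside the hunks shown in this diff. A minimal sketch of the harness that macro presumably expands to, reconstructed from the CHECK labels (treat the exact spelling as an assumption, not the committed definition):

#define TEST(TYPE)                          \
  SWIFTCALL TYPE return_##TYPE(void) {      \
    /* zero-initialize, which matches the   \
       memcpy-from-constant checks above */ \
    TYPE result = {};                       \
    return result;                          \
  }                                         \
  SWIFTCALL void take_##TYPE(TYPE v) {}     \
  void test_##TYPE(void) {                  \
    take_##TYPE(return_##TYPE());           \
  }

Here SWIFTCALL is the swiftcall attribute macro defined earlier in the same file. Each TEST(TYPE) thus yields a swiftcc return/take pair plus a plain-C driver, which is why each type in the TYPE MERGING and VECTOR LEGALIZATION sections is checked against three definitions: return_TYPE, take_TYPE, and test_TYPE.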