Diffstat (limited to 'lib/Headers')
-rw-r--r--  lib/Headers/CMakeLists.txt              6
-rw-r--r--  lib/Headers/Intrin.h                   11
-rw-r--r--  lib/Headers/__stddef_max_align_t.h      9
-rw-r--r--  lib/Headers/altivec.h                1348
-rw-r--r--  lib/Headers/arm_acle.h                 68
-rw-r--r--  lib/Headers/avx2intrin.h               37
-rw-r--r--  lib/Headers/avx512bwintrin.h          438
-rw-r--r--  lib/Headers/avx512dqintrin.h          237
-rw-r--r--  lib/Headers/avx512erintrin.h          326
-rw-r--r--  lib/Headers/avx512fintrin.h           995
-rw-r--r--  lib/Headers/avx512vlbwintrin.h        774
-rw-r--r--  lib/Headers/avx512vldqintrin.h        349
-rw-r--r--  lib/Headers/avx512vlintrin.h         1236
-rw-r--r--  lib/Headers/avxintrin.h               196
-rw-r--r--  lib/Headers/cuda_builtin_vars.h       110
-rw-r--r--  lib/Headers/emmintrin.h                84
-rw-r--r--  lib/Headers/htmintrin.h               226
-rw-r--r--  lib/Headers/htmxlintrin.h             363
-rw-r--r--  lib/Headers/immintrin.h                 8
-rw-r--r--  lib/Headers/module.modulemap           15
-rw-r--r--  lib/Headers/s390intrin.h               35
-rw-r--r--  lib/Headers/stdatomic.h                 4
-rw-r--r--  lib/Headers/unwind.h                   18
-rw-r--r--  lib/Headers/xmmintrin.h                 2
24 files changed, 6499 insertions, 396 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index 080550f7c77f..29a738e7a81a 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -9,15 +9,20 @@ set(files
avx512fintrin.h
avx512vlbwintrin.h
avx512vlintrin.h
+ avx512dqintrin.h
+ avx512vldqintrin.h
avxintrin.h
bmi2intrin.h
bmiintrin.h
cpuid.h
+ cuda_builtin_vars.h
emmintrin.h
f16cintrin.h
float.h
fma4intrin.h
fmaintrin.h
+ htmintrin.h
+ htmxlintrin.h
ia32intrin.h
immintrin.h
Intrin.h
@@ -34,6 +39,7 @@ set(files
prfchwintrin.h
rdseedintrin.h
rtmintrin.h
+ s390intrin.h
shaintrin.h
smmintrin.h
stdalign.h
diff --git a/lib/Headers/Intrin.h b/lib/Headers/Intrin.h
index 84bc4303a133..727a55e5b761 100644
--- a/lib/Headers/Intrin.h
+++ b/lib/Headers/Intrin.h
@@ -289,6 +289,7 @@ void _WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
static __inline__
+#define _XCR_XFEATURE_ENABLED_MASK 0
unsigned __int64 __cdecl _xgetbv(unsigned int);
void __cdecl _xrstor(void const *, unsigned __int64);
void __cdecl _xsave(void *, unsigned __int64);
@@ -780,17 +781,17 @@ _InterlockedCompareExchange64(__int64 volatile *_Destination,
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __attribute__((__always_inline__, __nodebug__))
-__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
_ReadWriteBarrier(void) {
__asm__ volatile ("" : : : "memory");
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
-__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
_ReadBarrier(void) {
__asm__ volatile ("" : : : "memory");
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
-__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
_WriteBarrier(void) {
__asm__ volatile ("" : : : "memory");
}
@@ -943,14 +944,14 @@ __readmsr(unsigned long __register) {
return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
}
-static __inline__ unsigned long __attribute__((always_inline, __nodebug__))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__readcr3(void) {
unsigned long __cr3_val;
__asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
return __cr3_val;
}
-static __inline__ void __attribute__((always_inline, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
__writecr3(unsigned int __cr3_val) {
__asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
}
diff --git a/lib/Headers/__stddef_max_align_t.h b/lib/Headers/__stddef_max_align_t.h
index a06f412c53fb..1e10ca9865c0 100644
--- a/lib/Headers/__stddef_max_align_t.h
+++ b/lib/Headers/__stddef_max_align_t.h
@@ -26,15 +26,18 @@
#ifndef __CLANG_MAX_ALIGN_T_DEFINED
#define __CLANG_MAX_ALIGN_T_DEFINED
-#ifndef _MSC_VER
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
typedef struct {
long long __clang_max_align_nonce1
__attribute__((__aligned__(__alignof__(long long))));
long double __clang_max_align_nonce2
__attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
-#else
-typedef double max_align_t;
#endif
#endif
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index 0ac0841ae482..1f8c831bb739 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -73,6 +73,18 @@ vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
static vector float __ATTRS_o_ai
vec_perm(vector float __a, vector float __b, vector unsigned char __c);
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai
+vec_perm(vector long long __a, vector long long __b, vector unsigned char __c);
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+ vector unsigned char __c);
+
+static vector double __ATTRS_o_ai
+vec_perm(vector double __a, vector double __b, vector unsigned char __c);
+#endif
+
static vector unsigned char __ATTRS_o_ai
vec_xor(vector unsigned char __a, vector unsigned char __b);
@@ -245,6 +257,20 @@ vec_add(vector unsigned int __a, vector bool int __b)
return __a + (vector unsigned int)__b;
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_add(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __a + __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
static vector float __ATTRS_o_ai
vec_add(vector float __a, vector float __b)
{
@@ -383,12 +409,24 @@ vec_vaddfp(vector float __a, vector float __b)
/* vec_addc */
-static vector unsigned int __attribute__((__always_inline__))
+static vector unsigned int __ATTRS_o_ai
vec_addc(vector unsigned int __a, vector unsigned int __b)
{
return __builtin_altivec_vaddcuw(__a, __b);
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
+ return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+ return __builtin_altivec_vaddcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
/* vec_vaddcuw */
static vector unsigned int __attribute__((__always_inline__))
@@ -627,6 +665,64 @@ vec_vadduws(vector unsigned int __a, vector bool int __b)
return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vadduqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __a + __b;
+}
+
+/* vec_vaddeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+/* vec_vaddcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+/* vec_vaddecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
/* vec_and */
#define __builtin_altivec_vand vec_and
@@ -1387,6 +1483,21 @@ vec_cmpeq(vector unsigned int __a, vector unsigned int __b)
__builtin_altivec_vcmpequw((vector int)__a, (vector int)__b);
}
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b)
+{
+ return (vector bool long long) __builtin_altivec_vcmpequd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return (vector bool long long)
+ __builtin_altivec_vcmpequd((vector long long)__a, (vector long long) __b);
+}
+#endif
+
static vector bool int __ATTRS_o_ai
vec_cmpeq(vector float __a, vector float __b)
{
@@ -1447,6 +1558,20 @@ vec_cmpgt(vector unsigned int __a, vector unsigned int __b)
return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
}
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b)
+{
+ return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
static vector bool int __ATTRS_o_ai
vec_cmpgt(vector float __a, vector float __b)
{
@@ -2270,7 +2395,7 @@ vec_vlogefp(vector float __a)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const signed char *__b)
{
@@ -2289,7 +2414,7 @@ vec_lvsl(int __a, const signed char *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const unsigned char *__b)
{
@@ -2308,7 +2433,7 @@ vec_lvsl(int __a, const unsigned char *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const short *__b)
{
@@ -2327,7 +2452,7 @@ vec_lvsl(int __a, const short *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const unsigned short *__b)
{
@@ -2346,7 +2471,7 @@ vec_lvsl(int __a, const unsigned short *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const int *__b)
{
@@ -2365,7 +2490,7 @@ vec_lvsl(int __a, const int *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const unsigned int *__b)
{
@@ -2384,7 +2509,7 @@ vec_lvsl(int __a, const unsigned int *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsl(int __a, const float *__b)
{
@@ -2405,7 +2530,7 @@ vec_lvsl(int __a, const float *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const signed char *__b)
{
@@ -2424,7 +2549,7 @@ vec_lvsr(int __a, const signed char *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const unsigned char *__b)
{
@@ -2443,7 +2568,7 @@ vec_lvsr(int __a, const unsigned char *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const short *__b)
{
@@ -2462,7 +2587,7 @@ vec_lvsr(int __a, const short *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const unsigned short *__b)
{
@@ -2481,7 +2606,7 @@ vec_lvsr(int __a, const unsigned short *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const int *__b)
{
@@ -2500,7 +2625,7 @@ vec_lvsr(int __a, const int *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const unsigned int *__b)
{
@@ -2519,7 +2644,7 @@ vec_lvsr(int __a, const unsigned int *__b)
#ifdef __LITTLE_ENDIAN__
static vector unsigned char __ATTRS_o_ai
-__attribute__((deprecated("use assignment for unaligned little endian \
+__attribute__((__deprecated__("use assignment for unaligned little endian \
loads/stores")))
vec_lvsr(int __a, const float *__b)
{
@@ -2679,6 +2804,20 @@ vec_max(vector unsigned int __a, vector bool int __b)
return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vmaxud(__a, __b);
+}
+#endif
+
static vector float __ATTRS_o_ai
vec_max(vector float __a, vector float __b)
{
@@ -3327,6 +3466,20 @@ vec_min(vector unsigned int __a, vector bool int __b)
return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vminsd(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vminud(__a, __b);
+}
+#endif
+
static vector float __ATTRS_o_ai
vec_min(vector float __a, vector float __b)
{
@@ -3762,6 +3915,28 @@ vec_mule(vector unsigned short __a, vector unsigned short __b)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_mule(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vmulosw(__a, __b);
+#else
+ return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vmulouw(__a, __b);
+#else
+ return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
/* vec_vmulesb */
static vector short __attribute__((__always_inline__))
@@ -3852,6 +4027,28 @@ vec_mulo(vector unsigned short __a, vector unsigned short __b)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_mulo(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vmulesw(__a, __b);
+#else
+ return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vmuleuw(__a, __b);
+#else
+ return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
/* vec_vmulosb */
static vector short __attribute__((__always_inline__))
@@ -4525,6 +4722,58 @@ vec_vpkuwum(vector bool int __a, vector bool int __b)
#endif
}
+/* vec_vpkudum */
+
+#ifdef __POWER8_VECTOR__
+#define __builtin_altivec_vpkudum vec_vpkudum
+
+static vector int __ATTRS_o_ai
+vec_vpkudum(vector long long __a, vector long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector int)vec_perm(__a, __b, (vector unsigned char)
+ (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+ 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+ return (vector int)vec_perm(__a, __b, (vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector unsigned int)vec_perm(__a, __b, (vector unsigned char)
+ (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+ 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+ return (vector unsigned int)vec_perm(__a, __b, (vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vpkudum(vector bool long long __a, vector bool long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector bool int)vec_perm((vector long long)__a,
+ (vector long long)__b,
+ (vector unsigned char)
+ (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+ 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+ return (vector bool int)vec_perm((vector long long)__a,
+ (vector long long)__b,
+ (vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+#endif
+
/* vec_packpx */
static vector pixel __attribute__((__always_inline__))
@@ -4591,6 +4840,28 @@ vec_packs(vector unsigned int __a, vector unsigned int __b)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai
+vec_packs(vector long long __a, vector long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpksdss(__b, __a);
+#else
+ return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packs(vector unsigned long long __a, vector unsigned long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpkudus(__b, __a);
+#else
+ return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
/* vec_vpkshss */
static vector signed char __attribute__((__always_inline__))
@@ -4603,6 +4874,20 @@ vec_vpkshss(vector short __a, vector short __b)
#endif
}
+/* vec_vpksdss */
+
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai
+vec_vpksdss(vector long long __a, vector long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpksdss(__b, __a);
+#else
+ return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+#endif
+
/* vec_vpkuhus */
static vector unsigned char __attribute__((__always_inline__))
@@ -4615,6 +4900,20 @@ vec_vpkuhus(vector unsigned short __a, vector unsigned short __b)
#endif
}
+/* vec_vpkudus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __attribute__((__always_inline__))
+vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpkudus(__b, __a);
+#else
+ return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
/* vec_vpkswss */
static vector signed short __attribute__((__always_inline__))
@@ -4681,6 +4980,28 @@ vec_packsu(vector unsigned int __a, vector unsigned int __b)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai
+vec_packsu(vector long long __a, vector long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpksdus(__b, __a);
+#else
+ return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpkudus(__b, __a);
+#else
+ return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
/* vec_vpkshus */
static vector unsigned char __ATTRS_o_ai
@@ -4725,6 +5046,20 @@ vec_vpkswus(vector unsigned int __a, vector unsigned int __b)
#endif
}
+/* vec_vpksdus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai
+vec_vpksdus(vector long long __a, vector long long __b)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vpksdus(__b, __a);
+#else
+ return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+#endif
+
/* vec_perm */
// The vperm instruction is defined architecturally with a big-endian bias.
@@ -5095,6 +5430,20 @@ vec_rl(vector unsigned int __a, vector unsigned int __b)
return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vrld(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
/* vec_vrlb */
static vector signed char __ATTRS_o_ai
@@ -5465,6 +5814,20 @@ vec_sl(vector unsigned int __a, vector unsigned int __b)
return __a << __b;
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b)
+{
+ return __a << (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __a << __b;
+}
+#endif
+
/* vec_vslb */
#define __builtin_altivec_vslb vec_vslb
@@ -6566,6 +6929,20 @@ vec_sr(vector unsigned int __a, vector unsigned int __b)
return __a >> __b;
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b)
+{
+ return __a >> (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __a >> __b;
+}
+#endif
+
/* vec_vsrb */
#define __builtin_altivec_vsrb vec_vsrb
@@ -6652,6 +7029,20 @@ vec_sra(vector unsigned int __a, vector unsigned int __b)
return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
}
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b)
+{
+ return __a >> __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return (vector unsigned long long) ( (vector signed long long) __a >> __b);
+}
+#endif
+
/* vec_vsrab */
static vector signed char __ATTRS_o_ai
@@ -8224,6 +8615,20 @@ vec_sub(vector unsigned int __a, vector bool int __b)
return __a - (vector unsigned int)__b;
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_sub(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __a - __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
static vector float __ATTRS_o_ai
vec_sub(vector float __a, vector float __b)
{
@@ -8362,12 +8767,26 @@ vec_vsubfp(vector float __a, vector float __b)
/* vec_subc */
-static vector unsigned int __attribute__((__always_inline__))
+static vector unsigned int __ATTRS_o_ai
vec_subc(vector unsigned int __a, vector unsigned int __b)
{
return __builtin_altivec_vsubcuw(__a, __b);
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector unsigned __int128 __ATTRS_o_ai
+vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector signed __int128 __ATTRS_o_ai
+vec_subc(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __builtin_altivec_vsubcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
/* vec_vsubcuw */
static vector unsigned int __attribute__((__always_inline__))
@@ -8606,6 +9025,68 @@ vec_vsubuws(vector unsigned int __a, vector bool int __b)
return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
}
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vsubuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __a - __b;
+}
+
+/* vec_vsubeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c)
+{
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c)
+{
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+/* vec_vsubcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b)
+{
+ return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b)
+{
+ return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+/* vec_vsubecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c)
+{
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c)
+{
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
/* vec_sum4s */
static vector int __ATTRS_o_ai
@@ -8797,6 +9278,28 @@ vec_unpackh(vector pixel __a)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai
+vec_unpackh(vector int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vupklsw(__a);
+#else
+ return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_unpackh(vector bool int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+ return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
/* vec_vupkhsb */
static vector short __ATTRS_o_ai
@@ -8851,6 +9354,30 @@ vec_vupkhsh(vector pixel __a)
#endif
}
+/* vec_vupkhsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai
+vec_vupkhsw(vector int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vupklsw(__a);
+#else
+ return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_vupkhsw(vector bool int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+ return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
/* vec_unpackl */
static vector short __ATTRS_o_ai
@@ -8903,6 +9430,28 @@ vec_unpackl(vector pixel __a)
#endif
}
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai
+vec_unpackl(vector int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vupkhsw(__a);
+#else
+ return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_unpackl(vector bool int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+ return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
/* vec_vupklsb */
static vector short __ATTRS_o_ai
@@ -8957,6 +9506,30 @@ vec_vupklsh(vector pixel __a)
#endif
}
+/* vec_vupklsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai
+vec_vupklsw(vector int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vupkhsw(__a);
+#else
+ return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_vupklsw(vector bool int __a)
+{
+#ifdef __LITTLE_ENDIAN__
+ return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+ return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
/* vec_vsx_ld */
#ifdef __VSX__
@@ -10887,6 +11460,55 @@ vec_all_eq(vector bool int __a, vector bool int __b)
return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_eq(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+ (vector long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_eq(vector float __a, vector float __b)
{
@@ -11033,6 +11655,56 @@ vec_all_ge(vector bool int __a, vector bool int __b)
(vector unsigned int)__a);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_ge(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static int __ATTRS_o_ai
+vec_all_ge(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+ __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+ __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_ge(vector float __a, vector float __b)
{
@@ -11179,6 +11851,56 @@ vec_all_gt(vector bool int __a, vector bool int __b)
(vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_gt(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static int __ATTRS_o_ai
+vec_all_gt(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+ __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_gt(vector float __a, vector float __b)
{
@@ -11333,6 +12055,57 @@ vec_all_le(vector bool int __a, vector bool int __b)
(vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_le(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+ __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_le(vector float __a, vector float __b)
{
@@ -11479,6 +12252,57 @@ vec_all_lt(vector bool int __a, vector bool int __b)
(vector unsigned int)__a);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_lt(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+ __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+ __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_lt(vector float __a, vector float __b)
{
@@ -11633,6 +12457,56 @@ vec_all_ne(vector bool int __a, vector bool int __b)
return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_ne(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_all_ne(vector float __a, vector float __b)
{
@@ -11837,6 +12711,61 @@ vec_any_eq(vector bool int __a, vector bool int __b)
__builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_eq(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned long long __a, vector bool long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector signed long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector unsigned long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector bool long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_eq(vector float __a, vector float __b)
{
@@ -11985,6 +12914,57 @@ vec_any_ge(vector bool int __a, vector bool int __b)
(vector unsigned int)__a);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_ge(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+ (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_ge(vector float __a, vector float __b)
{
@@ -12135,6 +13115,58 @@ vec_any_gt(vector bool int __a, vector bool int __b)
(vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_gt(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+ (vector signed long long)__b);
+}
+
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_gt(vector float __a, vector float __b)
{
@@ -12285,6 +13317,57 @@ vec_any_le(vector bool int __a, vector bool int __b)
(vector unsigned int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_le(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+ (vector unsigned long long)__a,
+ (vector unsigned long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_le(vector float __a, vector float __b)
{
@@ -12435,6 +13518,57 @@ vec_any_lt(vector bool int __a, vector bool int __b)
(vector unsigned int)__a);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_lt(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+ (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector unsigned long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
+ (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+ (vector unsigned long long)__b,
+ (vector unsigned long long)__a);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_lt(vector float __a, vector float __b)
{
@@ -12607,6 +13741,61 @@ vec_any_ne(vector bool int __a, vector bool int __b)
__builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
}
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_ne(vector signed long long __a, vector signed long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
+ (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed long long __a, vector bool long long __b)
+{
+ return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned long long __a, vector bool long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector signed long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector unsigned long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector bool long long __b)
+{
+ return
+ __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a,
+ (vector signed long long)__b);
+}
+#endif
+
static int __ATTRS_o_ai
vec_any_ne(vector float __a, vector float __b)
{
@@ -12661,6 +13850,133 @@ vec_any_out(vector float __a, vector float __b)
return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
}
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+ require -mpower8-vector
+The justification for this is that export requirements require that
+Category:Vector.Crypto is optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+FIXME: the naming convention for the builtins will be adjusted due
+to the inconsistency (__builtin_crypto_ prefix on builtins that cannot be
+removed with -mno-crypto). This is under development.
+*/
+#ifdef __CRYPTO__
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox (vector unsigned long long __a)
+{
+ return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher (vector unsigned long long __a,
+ vector unsigned long long __b)
+{
+ return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast (vector unsigned long long __a,
+ vector unsigned long long __b)
+{
+ return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher (vector unsigned long long __a,
+ vector unsigned long long __b)
+{
+ return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast (vector unsigned long long __a,
+ vector unsigned long long __b)
+{
+ return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned char __a,
+ vector unsigned char __b,
+ vector unsigned char __c)
+{
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned short __a,
+ vector unsigned short __b,
+ vector unsigned short __c)
+{
+ return (vector unsigned short)
+ __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+ (vector unsigned char) __b,
+ (vector unsigned char) __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned int __a,
+ vector unsigned int __b,
+ vector unsigned int __c)
+{
+ return (vector unsigned int)
+ __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+ (vector unsigned char) __b,
+ (vector unsigned char) __c);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned long long __a,
+ vector unsigned long long __b,
+ vector unsigned long long __c)
+{
+ return (vector unsigned long long)
+ __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+ (vector unsigned char) __b,
+ (vector unsigned char) __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned char __a,
+ vector unsigned char __b)
+{
+ return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned short __a,
+ vector unsigned short __b)
+{
+ return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned int __a,
+ vector unsigned int __b)
+{
+ return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned long long __a,
+ vector unsigned long long __b)
+{
+ return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+#endif
+
#undef __ATTRS_o_ai
#endif /* __ALTIVEC_H */
diff --git a/lib/Headers/arm_acle.h b/lib/Headers/arm_acle.h
index 814df2c3d782..6c56f3b77812 100644
--- a/lib/Headers/arm_acle.h
+++ b/lib/Headers/arm_acle.h
@@ -45,23 +45,23 @@ extern "C" {
/* 8.4 Hints */
#if !defined(_MSC_VER)
-static __inline__ void __attribute__((always_inline, nodebug)) __wfi(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
__builtin_arm_wfi();
}
-static __inline__ void __attribute__((always_inline, nodebug)) __wfe(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
__builtin_arm_wfe();
}
-static __inline__ void __attribute__((always_inline, nodebug)) __sev(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
__builtin_arm_sev();
}
-static __inline__ void __attribute__((always_inline, nodebug)) __sevl(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
__builtin_arm_sevl();
}
-static __inline__ void __attribute__((always_inline, nodebug)) __yield(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
__builtin_arm_yield();
}
#endif
@@ -71,7 +71,7 @@ static __inline__ void __attribute__((always_inline, nodebug)) __yield(void) {
#endif
/* 8.5 Swap */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t x, volatile uint32_t *p) {
uint32_t v;
do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
@@ -102,28 +102,28 @@ static __inline__ uint32_t __attribute__((always_inline, nodebug))
#endif
/* 8.7 NOP */
-static __inline__ void __attribute__((always_inline, nodebug)) __nop(void) {
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
__builtin_arm_nop();
}
/* 9 DATA-PROCESSING INTRINSICS */
/* 9.2 Miscellaneous data-processing intrinsics */
/* ROR */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t x, uint32_t y) {
y %= 32;
if (y == 0) return x;
return (x >> y) | (x << (32 - y));
}
-static __inline__ uint64_t __attribute__((always_inline, nodebug))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t x, uint32_t y) {
y %= 64;
if (y == 0) return x;
return (x >> y) | (x << (64 - y));
}
-static __inline__ unsigned long __attribute__((always_inline, nodebug))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long x, uint32_t y) {
#if __SIZEOF_LONG__ == 4
return __ror(x, y);
@@ -134,28 +134,28 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug))
/* CLZ */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t t) {
return __builtin_clz(t);
}
-static __inline__ unsigned long __attribute__((always_inline, nodebug))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long t) {
return __builtin_clzl(t);
}
-static __inline__ uint64_t __attribute__((always_inline, nodebug))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t t) {
return __builtin_clzll(t);
}
/* REV */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t t) {
return __builtin_bswap32(t);
}
-static __inline__ unsigned long __attribute__((always_inline, nodebug))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long t) {
#if __SIZEOF_LONG__ == 4
return __builtin_bswap32(t);
@@ -164,40 +164,40 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug))
#endif
}
-static __inline__ uint64_t __attribute__((always_inline, nodebug))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t t) {
return __builtin_bswap64(t);
}
/* REV16 */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t t) {
return __ror(__rev(t), 16);
}
-static __inline__ unsigned long __attribute__((always_inline, nodebug))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long t) {
return __rorl(__revl(t), sizeof(long) / 2);
}
-static __inline__ uint64_t __attribute__((always_inline, nodebug))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t t) {
return __rorll(__revll(t), 32);
}
/* REVSH */
-static __inline__ int16_t __attribute__((always_inline, nodebug))
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t t) {
return __builtin_bswap16(t);
}
/* RBIT */
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t t) {
return __builtin_arm_rbit(t);
}
-static __inline__ uint64_t __attribute__((always_inline, nodebug))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t t) {
#if __ARM_32BIT_STATE
return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
@@ -207,7 +207,7 @@ static __inline__ uint64_t __attribute__((always_inline, nodebug))
#endif
}
-static __inline__ unsigned long __attribute__((always_inline, nodebug))
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long t) {
#if __SIZEOF_LONG__ == 4
return __rbit(t);
@@ -230,17 +230,17 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug))
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_32BIT_STATE
-static __inline__ int32_t __attribute__((always_inline, nodebug))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t t, int32_t v) {
return __builtin_arm_qadd(t, v);
}
-static __inline__ int32_t __attribute__((always_inline, nodebug))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t t, int32_t v) {
return __builtin_arm_qsub(t, v);
}
-static __inline__ int32_t __attribute__((always_inline, nodebug))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t t) {
return __builtin_arm_qadd(t, t);
}
@@ -248,42 +248,42 @@ __qdbl(int32_t t) {
/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32b(uint32_t a, uint8_t b) {
return __builtin_arm_crc32b(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32h(uint32_t a, uint16_t b) {
return __builtin_arm_crc32h(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32w(uint32_t a, uint32_t b) {
return __builtin_arm_crc32w(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32d(uint32_t a, uint64_t b) {
return __builtin_arm_crc32d(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cb(uint32_t a, uint8_t b) {
return __builtin_arm_crc32cb(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32ch(uint32_t a, uint16_t b) {
return __builtin_arm_crc32ch(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cw(uint32_t a, uint32_t b) {
return __builtin_arm_crc32cw(a, b);
}
-static __inline__ uint32_t __attribute__((always_inline, nodebug))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cd(uint32_t a, uint64_t b) {
return __builtin_arm_crc32cd(a, b);
}
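A minimal usage sketch, not part of the patch: the switch to the reserved __always_inline__/__nodebug__ spellings does not change how the ACLE intrinsics are called. The example assumes a target that defines __ARM_FEATURE_CRC32; the helper name crc32_bytes is an assumption of this sketch.

#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>

/* Fold a byte buffer into a running CRC-32, one byte per iteration. */
static uint32_t crc32_bytes(uint32_t crc, const uint8_t *p, size_t n) {
  while (n--)
    crc = __crc32b(crc, *p++);
  return crc;
}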
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index 394fdfee9652..e1e639de1ba8 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -160,7 +160,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
__m256i __V1 = (V1); \
__m256i __V2 = (V2); \
- (__m256d)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
+ (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
(((M) & 0x01) ? 16 : 0), \
(((M) & 0x02) ? 17 : 1), \
(((M) & 0x04) ? 18 : 2), \
@@ -542,6 +542,8 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
__m256i __a = (a); \
(__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_slli_epi16(__m256i __a, int __count)
{
@@ -606,6 +608,8 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
__m256i __a = (a); \
(__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srli_epi16(__m256i __a, int __count)
{
@@ -756,6 +760,12 @@ _mm_broadcastss_ps(__m128 __X)
return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
}
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastsd_pd(__m128d __a)
+{
+ return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcastss_ps(__m128 __X)
{
@@ -771,7 +781,7 @@ _mm256_broadcastsd_pd(__m128d __X)
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_broadcastsi128_si256(__m128i __X)
{
- return (__m256i)__builtin_ia32_vbroadcastsi256(__X);
+ return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
}
#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
@@ -874,14 +884,21 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256 __b)
__m256i __V2 = (V2); \
(__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); })
-#define _mm256_extracti128_si256(A, O) __extension__ ({ \
- __m256i __A = (A); \
- (__m128i)__builtin_ia32_extract128i256(__A, (O)); })
-
-#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \
- __m256i __V1 = (V1); \
- __m128i __V2 = (V2); \
- (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); })
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+ (__m128i)__builtin_shufflevector( \
+ (__v4di)(V), \
+ (__v4di)(_mm256_setzero_si256()), \
+ (((M) & 1) ? 2 : 0), \
+ (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+ (__m256i)__builtin_shufflevector( \
+ (__v4di)(V1), \
+ (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+ (((M) & 1) ? 0 : 4), \
+ (((M) & 1) ? 1 : 5), \
+ (((M) & 1) ? 4 : 2), \
+ (((M) & 1) ? 5 : 3) );})
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_epi32(int const *__X, __m256i __M)
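A minimal usage sketch, not part of the patch: the extract/insert macros are now built on __builtin_shufflevector, but callers still pass an immediate lane selector exactly as before. The helper swap_128bit_halves is an assumption of this sketch and requires an AVX2 target.

#include <immintrin.h>

/* Exchange the low and high 128-bit lanes of a 256-bit integer vector. */
static __inline__ __m256i swap_128bit_halves(__m256i v) {
  __m128i lo = _mm256_extracti128_si256(v, 0);  /* bits 127:0   */
  __m128i hi = _mm256_extracti128_si256(v, 1);  /* bits 255:128 */
  return _mm256_inserti128_si256(_mm256_castsi128_si256(hi), lo, 1);
}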
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index bc4d4ac6afdd..d0591e406f7b 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -21,15 +21,37 @@
*
*===-----------------------------------------------------------------------===
*/
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
#ifndef __AVX512BWINTRIN_H
#define __AVX512BWINTRIN_H
typedef unsigned int __mmask32;
typedef unsigned long long __mmask64;
-typedef char __v64qi __attribute__ ((vector_size (64)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+static __inline __v64qi __attribute__ ((__always_inline__, __nodebug__))
+_mm512_setzero_qi (void) {
+ return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __v32hi __attribute__ ((__always_inline__, __nodebug__))
+_mm512_setzero_hi (void) {
+ return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
/* Integer compare */
@@ -45,6 +67,18 @@ _mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
__u);
}
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+ __u);
+}
+
static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
@@ -57,4 +91,406 @@ _mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
__u);
}
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+ (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+ return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+ return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+ __u);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v64qi) __A + (__v64qi) __B);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_qi (),
+ (__mmask64) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v64qi) __A - (__v64qi) __B);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_qi (),
+ (__mmask64) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hi) __A + (__v32hi) __B);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_hi (),
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hi) __A - (__v32hi) __B);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_hi (),
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hi) __A * (__v32hi) __B);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_hi (),
+ (__mmask32) __U);
+}
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)(m)); })
+
#endif
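A minimal usage sketch, not part of the patch: the new unsigned byte compare yields one mask bit per lane, so counting matches reduces to a popcount of the __mmask64 result. The helper count_equal_bytes and the use of __builtin_popcountll are assumptions of this sketch; an AVX512BW target is required.

#include <immintrin.h>

/* Count how many of the 64 byte lanes of a and b compare equal. */
static __inline__ unsigned count_equal_bytes(__m512i a, __m512i b) {
  __mmask64 m = _mm512_cmpeq_epu8_mask(a, b);
  return (unsigned)__builtin_popcountll(m);
}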
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
new file mode 100644
index 000000000000..fd33be2f44a6
--- /dev/null
+++ b/lib/Headers/avx512dqintrin.h
@@ -0,0 +1,237 @@
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v8di) __A * (__v8di) __B);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_xor_pd (__m512d __A, __m512d __B) {
+ return (__m512d) ((__v8di) __A ^ (__v8di) __B);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+ return (__m512) ((__v16si) __A ^ (__v16si) __B);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_or_pd (__m512d __A, __m512d __B) {
+ return (__m512d) ((__v8di) __A | (__v8di) __B);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_or_ps (__m512 __A, __m512 __B) {
+ return (__m512) ((__v16si) __A | (__v16si) __B);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_and_pd (__m512d __A, __m512d __B) {
+ return (__m512d) ((__v8di) __A & (__v8di) __B);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_and_ps (__m512 __A, __m512 __B) {
+ return (__m512) ((__v16si) __A & (__v16si) __B);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_andnot_pd (__m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_andnot_ps (__m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+#endif
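A minimal usage sketch, not part of the patch: AVX512DQ adds packed-double bitwise operations, which makes sign-bit manipulation a single andnot. The helper vector_fabs and the use of _mm512_set1_pd are assumptions of this sketch; an AVX512DQ target is required.

#include <immintrin.h>

/* Clear the sign bit of every lane: |x| for all eight doubles. */
static __inline__ __m512d vector_fabs(__m512d x) {
  const __m512d sign = _mm512_set1_pd(-0.0);   /* only the sign bit set */
  return _mm512_andnot_pd(sign, x);            /* ~sign & x             */
}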
diff --git a/lib/Headers/avx512erintrin.h b/lib/Headers/avx512erintrin.h
index 1a5ea153adf3..57c61aa0e112 100644
--- a/lib/Headers/avx512erintrin.h
+++ b/lib/Headers/avx512erintrin.h
@@ -28,85 +28,259 @@
#define __AVX512ERINTRIN_H
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_exp2a23_pd(A) \
+ _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+ _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+ _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_exp2a23_ps(A) \
+ _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+ _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+ _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
// rsqrt28
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
-_mm512_rsqrt28_round_pd (__m512d __A, int __R)
-{
- return (__m512d)__builtin_ia32_rsqrt28pd_mask ((__v8df)__A,
- (__v8df)_mm512_setzero_pd(),
- (__mmask8)-1,
- __R);
-}
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
-_mm512_rsqrt28_round_ps(__m512 __A, int __R)
-{
- return (__m512)__builtin_ia32_rsqrt28ps_mask ((__v16sf)__A,
- (__v16sf)_mm512_setzero_ps(),
- (__mmask16)-1,
- __R);
-}
-
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
-_mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R)
-{
- return (__m128) __builtin_ia32_rsqrt28ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1,
- __R);
-}
-
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
-_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
-{
- return (__m128d) __builtin_ia32_rsqrt28sd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1,
- __R);
-}
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+ _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+ _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+ _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(S), \
+ (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(M), (R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+ _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+ _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+ _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+ _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+ _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+ _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+ _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
// rcp28
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
-_mm512_rcp28_round_pd (__m512d __A, int __R)
-{
- return (__m512d)__builtin_ia32_rcp28pd_mask ((__v8df)__A,
- (__v8df)_mm512_setzero_pd(),
- (__mmask8)-1,
- __R);
-}
-
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
-_mm512_rcp28_round_ps (__m512 __A, int __R)
-{
- return (__m512)__builtin_ia32_rcp28ps_mask ((__v16sf)__A,
- (__v16sf)_mm512_setzero_ps (),
- (__mmask16)-1,
- __R);
-}
-
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
-_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
-{
- return (__m128) __builtin_ia32_rcp28ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1,
- __R);
-}
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
-_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
-{
- return (__m128d) __builtin_ia32_rcp28sd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1,
- __R);
-}
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm512_rcp28_pd(A) \
+ _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+ _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+ _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(S), \
+ (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(M), (R)); })
+
+#define _mm512_rcp28_ps(A) \
+ _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+ _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+ _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+ (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_ss(A, B) \
+ _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+ _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+ _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(S), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+ (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_sd(A, B) \
+ _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+ _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+ _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#endif // __AVX512ERINTRIN_H
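A minimal usage sketch, not part of the patch: turning the ER intrinsics into macros lets the rounding argument reach the builtin as a compile-time constant, and the suffix-less forms default to _MM_FROUND_CUR_DIRECTION. The helper fast_inverse_sqrt is an assumption of this sketch and requires an AVX512ER target.

#include <immintrin.h>

/* Approximate 1/sqrt(x) for eight doubles with 2^-28 relative error. */
static __inline__ __m512d fast_inverse_sqrt(__m512d x) {
  return _mm512_rsqrt28_pd(x);
}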
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 9c80710110b0..d299704d9b2b 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -162,6 +162,224 @@ _mm512_castps512_ps128(__m512 __a)
return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}
+/* Bitwise operators */
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+ return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si) __src,
+ (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+ return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di) __src,
+ (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __U);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+                                                  _mm512_setzero_si512 (),
+ __U);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+ return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si) __src,
+ (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+ return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di) __src,
+ (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+ return __a ^ __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si) __src,
+ (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+ (__v16si) __b,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+ return __a ^ __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di) __src,
+ (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+ (__v8di) __b,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+ return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+ return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+ return __a ^ __b;
+}
/* Arithmetic */
static __inline __m512d __attribute__((__always_inline__, __nodebug__))
@@ -200,6 +418,106 @@ _mm512_sub_ps(__m512 __a, __m512 __b)
return __a - __b;
}
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8di) __A + (__v8di) __B);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8di) __A - (__v8di) __B);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16si) __A + (__v16si) __B);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16si) __A - (__v16si) __B);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_max_pd(__m512d __A, __m512d __B)
{
@@ -337,6 +655,24 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y)
}
static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di) __W, __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
@@ -346,6 +682,48 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y)
(__mmask8) -1);
}
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di) __W, __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16si) __A * (__v16si) __B);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_pd(__m512d a)
{
@@ -492,20 +870,13 @@ _mm512_abs_epi32(__m512i __A)
(__mmask16) -1);
}
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
-_mm512_roundscale_ps(__m512 __A, const int __imm)
-{
- return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
- (__v16sf) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
-_mm512_roundscale_pd(__m512d __A, const int __imm)
-{
- return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
- (__v8df) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
+ -1, _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
+ -1, _MM_FROUND_CUR_DIRECTION); })
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
@@ -613,25 +984,35 @@ _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
(__mmask16) -1);
}
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_valign_epi64(__m512i __A, __m512i __B, const int __I)
-{
- return (__m512i) __builtin_ia32_alignq512_mask((__v8di)__A,
- (__v8di)__B,
- __I,
- (__v8di)_mm512_setzero_si512(),
- (__mmask8) -1);
-}
-
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_valign_epi32(__m512i __A, __m512i __B, const int __I)
-{
- return (__m512i)__builtin_ia32_alignd512_mask((__v16si)__A,
- (__v16si)__B,
- __I,
- (__v16si)_mm512_setzero_si512(),
- (__mmask16) -1);
-}
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+ (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
+ (I), (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+ (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
+ (I), (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1); })
+
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
+ __m512d __A = (A); \
+ (__m256d) \
+ __builtin_ia32_extractf64x4_mask((__v8df)__A, \
+ (I), \
+                                   (__v4df)_mm256_setzero_pd(), \
+ (__mmask8) -1); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
+ __m512 __A = (A); \
+ (__m128) \
+ __builtin_ia32_extractf32x4_mask((__v16sf)__A, \
+ (I), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8) -1); })
/* Vector Blend */
@@ -669,22 +1050,37 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
/* Compare */
-static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cmp_ps_mask(__m512 a, __m512 b, const int p)
-{
- return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) a,
- (__v16sf) b, p, (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (P), (__mmask16)-1, (R)); })
-static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P)
-{
- return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
- (__v8df) __Y, __P,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (P), (__mmask16)(U), (R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+ _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (P), (__mmask8)-1, (R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (P), (__mmask8)(U), (R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+ _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
/* Conversion */
@@ -698,25 +1094,15 @@ _mm512_cvttps_epu32(__m512 __A)
_MM_FROUND_CUR_DIRECTION);
}
-static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
-_mm512_cvt_roundepi32_ps(__m512i __A, const int __R)
-{
- return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1,
- __R);
-}
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (R)); })
-static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
-_mm512_cvt_roundepu32_ps(__m512i __A, const int __R)
-{
- return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1,
- __R);
-}
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (R)); })
static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvtepi32_pd(__m256i __A)
@@ -735,25 +1121,16 @@ _mm512_cvtepu32_pd(__m256i __A)
_mm512_setzero_pd (),
(__mmask8) -1);
}
-static __inline __m256 __attribute__ (( __always_inline__, __nodebug__))
-_mm512_cvt_roundpd_ps(__m512d __A, const int __R)
-{
- return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) -1,
- __R);
-}
-static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvtps_ph(__m512 __A, const int __I)
-{
- return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
- __I,
- (__v16hi)
- _mm256_setzero_si256 (),
- -1);
-}
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+ (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \
+ (__v16hi)_mm256_setzero_si256(), \
+ -1); })
static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtph_ps(__m256i __A)
@@ -783,61 +1160,35 @@ _mm512_cvttpd_epi32(__m512d a)
_MM_FROUND_CUR_DIRECTION);
}
-static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvtt_roundpd_epi32(__m512d __A, const int __R)
-{
- return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) -1,
- __R);
-}
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvtt_roundps_epi32(__m512 __A, const int __R)
-{
- return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1,
- __R);
-}
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+ (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)-1, (R)); })
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvt_roundps_epi32(__m512 __A, const int __R)
-{
- return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1,
- __R);
-}
-static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvt_roundpd_epi32(__m512d __A, const int __R)
-{
- return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) -1,
- __R);
-}
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvt_roundps_epu32(__m512 __A, const int __R)
-{
- return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1,
- __R);
-}
-static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
-_mm512_cvt_roundpd_epu32(__m512d __A, const int __R)
-{
- return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) -1,
- __R);
-}
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+ (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+ (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+ (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)-1, (R)); })
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+ (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+ (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (R)); })
/* Unpack and Interleave */
static __inline __m512d __attribute__((__always_inline__, __nodebug__))
@@ -928,12 +1279,30 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
(__mmask8) __U);
}
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_pd(double const *__p)
{
struct __loadu_pd {
__m512d __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__p)->__v;
}
@@ -942,10 +1311,28 @@ _mm512_loadu_ps(float const *__p)
{
struct __loadu_ps {
__m512 __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_ps*)__p)->__v;
}
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_load_ps(float const *__p)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_load_pd(double const *__p)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
/* SIMD store ops */
static __inline void __attribute__ ((__always_inline__, __nodebug__))
@@ -988,9 +1375,9 @@ _mm512_storeu_ps(void *__P, __m512 __A)
}
static __inline void __attribute__ ((__always_inline__, __nodebug__))
-_mm512_store_ps(void *__P, __m512 __A)
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
{
- *(__m512*)__P = __A;
+ __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}
static __inline void __attribute__ ((__always_inline__, __nodebug__))
@@ -999,6 +1386,19 @@ _mm512_store_pd(void *__P, __m512d __A)
*(__m512d*)__P = __A;
}
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_store_ps(void *__P, __m512 __A)
+{
+ *(__m512*)__P = __A;
+}
+
/* Mask ops */
static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
@@ -1021,6 +1421,18 @@ _mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
__u);
}
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+ __u);
+}
+
static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
@@ -1033,4 +1445,303 @@ _mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
(__mmask8)-1);
}
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+ return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+ __u);
+}
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+ (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+ (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+ (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+ (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+ (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+ (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+ (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+ __m512i __a = (a); \
+ __m512i __b = (b); \
+ (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+ (__mmask8)(m)); })
#endif // __AVX512FINTRIN_H
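A usage sketch of the intrinsics added to avx512fintrin.h above: the masked,
zero-masked, and unsigned-compare forms are intended to compose, with a compare
producing an __mmask16 that then selects lanes for a masked arithmetic op. The
helper name conditional_add_epi32 is hypothetical; it assumes AVX-512F support
(compile with -mavx512f) and that the header is reached through <immintrin.h>.

#include <immintrin.h>

/* Lanes where a > b (unsigned) receive a + b; all other lanes keep src. */
static inline __m512i conditional_add_epi32(__m512i a, __m512i b, __m512i src)
{
  __mmask16 m = _mm512_cmpgt_epu32_mask(a, b);
  return _mm512_mask_add_epi32(src, m, a, b);
}
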
diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h
index 11333f851756..c3b087e303b4 100644
--- a/lib/Headers/avx512vlbwintrin.h
+++ b/lib/Headers/avx512vlbwintrin.h
@@ -42,6 +42,17 @@ _mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
__u);
}
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+ __u);
+}
static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
@@ -55,6 +66,18 @@ _mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
__u);
}
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+ __u);
+}
+
static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
@@ -67,6 +90,17 @@ _mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
__u);
}
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+ __u);
+}
static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
@@ -80,4 +114,744 @@ _mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
__u);
}
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+ return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+ (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+ return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+ (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+ return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+ __u);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), \
+ (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), \
+ (p), (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), \
+ (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), \
+ (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), \
+ (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), \
+ (p), (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), \
+ (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), \
+ (p), (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), \
+ (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), \
+ (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), \
+ (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), \
+ (p), (__mmask16)(m)); })
+
#endif /* __AVX512VLBWINTRIN_H */
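A similar sketch for the 128/256-bit byte and word forms added to
avx512vlbwintrin.h above: an unsigned byte compare yields an __mmask32 that
drives a masked byte add. The helper name clamp_add_epi8 is hypothetical; it
assumes AVX-512VL plus AVX-512BW (compile with -mavx512vl -mavx512bw) and
inclusion through <immintrin.h>.

#include <immintrin.h>

/* Add bytes only where a <= b (unsigned); the remaining lanes keep fallback. */
static inline __m256i clamp_add_epi8(__m256i a, __m256i b, __m256i fallback)
{
  __mmask32 m = _mm256_cmple_epu8_mask(a, b);
  return _mm256_mask_add_epi8(fallback, m, a, b);
}
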
diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h
new file mode 100644
index 000000000000..4024446a3a6a
--- /dev/null
+++ b/lib/Headers/avx512vldqintrin.h
@@ -0,0 +1,349 @@
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+ return (__m256i) ((__v4di) __A * (__v4di) __B);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+ return (__m128i) ((__v2di) __A * (__v2di) __B);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B) {
+ return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+#endif /* __AVX512VLDQINTRIN_H */
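(Sketch only, not part of the patch: how the new masked VL/DQ operations compose. The helper name and the 0x5 lane selector are made up for illustration; the code assumes -mavx512vl -mavx512dq and inclusion via <immintrin.h>.)

    #include <immintrin.h>

    /* Multiply the 64-bit lanes 0 and 2 of __v by __s; the maskz form
       zeroes the lanes whose mask bit is clear.  */
    static inline __m256i
    scale_even_qwords(__m256i __v, __m256i __s)
    {
      return _mm256_maskz_mullo_epi64((__mmask8)0x5, __v, __s);
    }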
diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h
index 8a374b102676..9de0cf418b78 100644
--- a/lib/Headers/avx512vlintrin.h
+++ b/lib/Headers/avx512vlintrin.h
@@ -42,6 +42,17 @@ _mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
__u);
}
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+ __u);
+}
static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
@@ -56,6 +67,18 @@ _mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
}
static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
(__mmask8)-1);
@@ -67,6 +90,17 @@ _mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
__u);
}
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+ __u);
+}
static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
@@ -80,4 +114,1206 @@ _mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
__u);
}
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+ __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+ (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+ return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+ __u);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+						 _mm256_setzero_si256 (),
+ __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+						 _mm_setzero_si128 (),
+ __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+						 _mm256_setzero_si256 (),
+ __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+						 _mm_setzero_si128 (),
+ __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_ps_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_pd_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_ps_mask(a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \
+ (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), \
+ (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_pd_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+ (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+ (p), (__mmask8)(m)); })
#endif /* __AVX512VLINTRIN_H */
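(Sketch only, not part of the patch: the unsigned-compare additions return a bitmask with one bit per lane, so ordinary bit tricks apply to the result. The helper name and the popcount reduction are illustrative; assumes -mavx512vl and inclusion via <immintrin.h>.)

    #include <immintrin.h>

    /* Count the 32-bit lanes of __a that are >= the corresponding lanes
       of __b, comparing as unsigned.  */
    static inline int
    count_uge_dwords(__m256i __a, __m256i __b)
    {
      __mmask8 __k = _mm256_cmpge_epu32_mask(__a, __b);
      return __builtin_popcount((unsigned)__k);
    }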
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 4e1044af5611..4907965861d4 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -257,8 +257,7 @@ _mm_permutevar_ps(__m128 __a, __m128i __c)
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 __a, __m256i __c)
{
- return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a,
- (__v8si)__c);
+ return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
}
#define _mm_permute_pd(A, C) __extension__ ({ \
@@ -430,35 +429,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
__m128 __b = (b); \
(__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
-/* Vector extract */
-#define _mm256_extractf128_pd(A, O) __extension__ ({ \
- __m256d __A = (A); \
- (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
-
-#define _mm256_extractf128_ps(A, O) __extension__ ({ \
- __m256 __A = (A); \
- (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
-
-#define _mm256_extractf128_si256(A, O) __extension__ ({ \
- __m256i __A = (A); \
- (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
-
static __inline int __attribute__((__always_inline__, __nodebug__))
-_mm256_extract_epi32(__m256i __a, int const __imm)
+_mm256_extract_epi32(__m256i __a, const int __imm)
{
__v8si __b = (__v8si)__a;
return __b[__imm & 7];
}
static __inline int __attribute__((__always_inline__, __nodebug__))
-_mm256_extract_epi16(__m256i __a, int const __imm)
+_mm256_extract_epi16(__m256i __a, const int __imm)
{
__v16hi __b = (__v16hi)__a;
return __b[__imm & 15];
}
static __inline int __attribute__((__always_inline__, __nodebug__))
-_mm256_extract_epi8(__m256i __a, int const __imm)
+_mm256_extract_epi8(__m256i __a, const int __imm)
{
__v32qi __b = (__v32qi)__a;
return __b[__imm & 31];
@@ -473,22 +459,6 @@ _mm256_extract_epi64(__m256i __a, const int __imm)
}
#endif
-/* Vector insert */
-#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
- __m256d __V1 = (V1); \
- __m128d __V2 = (V2); \
- (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
-
-#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
- __m256 __V1 = (V1); \
- __m128 __V2 = (V2); \
- (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
-
-#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
- __m256i __V1 = (V1); \
- __m128i __V2 = (V2); \
- (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
-
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
@@ -515,7 +485,7 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
-_mm256_insert_epi64(__m256i __a, int __b, int const __imm)
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
__v4di __c = (__v4di)__a;
__c[__imm & 3] = __b;
@@ -785,7 +755,7 @@ _mm256_loadu_pd(double const *__p)
{
struct __loadu_pd {
__m256d __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__p)->__v;
}
@@ -794,7 +764,7 @@ _mm256_loadu_ps(float const *__p)
{
struct __loadu_ps {
__m256 __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_ps*)__p)->__v;
}
@@ -809,7 +779,7 @@ _mm256_loadu_si256(__m256i const *__p)
{
struct __loadu_si256 {
__m256i __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_si256*)__p)->__v;
}
@@ -935,23 +905,23 @@ _mm256_set_pd(double __a, double __b, double __c, double __d)
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float __a, float __b, float __c, float __d,
- float __e, float __f, float __g, float __h)
+ float __e, float __f, float __g, float __h)
{
return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
- int __i4, int __i5, int __i6, int __i7)
+ int __i4, int __i5, int __i6, int __i7)
{
return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
- short __w11, short __w10, short __w09, short __w08,
- short __w07, short __w06, short __w05, short __w04,
- short __w03, short __w02, short __w01, short __w00)
+ short __w11, short __w10, short __w09, short __w08,
+ short __w07, short __w06, short __w05, short __w04,
+ short __w03, short __w02, short __w01, short __w00)
{
return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
@@ -959,13 +929,13 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
- char __b27, char __b26, char __b25, char __b24,
- char __b23, char __b22, char __b21, char __b20,
- char __b19, char __b18, char __b17, char __b16,
- char __b15, char __b14, char __b13, char __b12,
- char __b11, char __b10, char __b09, char __b08,
- char __b07, char __b06, char __b05, char __b04,
- char __b03, char __b02, char __b01, char __b00)
+ char __b27, char __b26, char __b25, char __b24,
+ char __b23, char __b22, char __b21, char __b20,
+ char __b19, char __b18, char __b17, char __b16,
+ char __b15, char __b14, char __b13, char __b12,
+ char __b11, char __b10, char __b09, char __b08,
+ char __b07, char __b06, char __b05, char __b04,
+ char __b03, char __b02, char __b01, char __b00)
{
return (__m256i)(__v32qi){
__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
@@ -990,23 +960,23 @@ _mm256_setr_pd(double __a, double __b, double __c, double __d)
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float __a, float __b, float __c, float __d,
- float __e, float __f, float __g, float __h)
+ float __e, float __f, float __g, float __h)
{
return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
- int __i4, int __i5, int __i6, int __i7)
+ int __i4, int __i5, int __i6, int __i7)
{
return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
- short __w11, short __w10, short __w09, short __w08,
- short __w07, short __w06, short __w05, short __w04,
- short __w03, short __w02, short __w01, short __w00)
+ short __w11, short __w10, short __w09, short __w08,
+ short __w07, short __w06, short __w05, short __w04,
+ short __w03, short __w02, short __w01, short __w00)
{
return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
@@ -1014,19 +984,19 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
- char __b27, char __b26, char __b25, char __b24,
- char __b23, char __b22, char __b21, char __b20,
- char __b19, char __b18, char __b17, char __b16,
- char __b15, char __b14, char __b13, char __b12,
- char __b11, char __b10, char __b09, char __b08,
- char __b07, char __b06, char __b05, char __b04,
- char __b03, char __b02, char __b01, char __b00)
+ char __b27, char __b26, char __b25, char __b24,
+ char __b23, char __b22, char __b21, char __b20,
+ char __b19, char __b18, char __b17, char __b16,
+ char __b15, char __b14, char __b13, char __b12,
+ char __b11, char __b10, char __b09, char __b08,
+ char __b07, char __b06, char __b05, char __b04,
+ char __b03, char __b02, char __b01, char __b00)
{
return (__m256i)(__v32qi){
__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
- __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
- __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
- __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+ __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+ __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+ __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
@@ -1167,6 +1137,70 @@ _mm256_castsi128_si256(__m128i __a)
return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}
+/*
+ Vector insert.
+ We use macros rather than inlines because we only want to accept
+ invocations where the immediate M is a constant expression.
+*/
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+ (__m256)__builtin_shufflevector( \
+ (__v8sf)(V1), \
+ (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+ (((M) & 1) ? 0 : 8), \
+ (((M) & 1) ? 1 : 9), \
+ (((M) & 1) ? 2 : 10), \
+ (((M) & 1) ? 3 : 11), \
+ (((M) & 1) ? 8 : 4), \
+ (((M) & 1) ? 9 : 5), \
+ (((M) & 1) ? 10 : 6), \
+ (((M) & 1) ? 11 : 7) );})
+
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+ (__m256d)__builtin_shufflevector( \
+ (__v4df)(V1), \
+ (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+ (((M) & 1) ? 0 : 4), \
+ (((M) & 1) ? 1 : 5), \
+ (((M) & 1) ? 4 : 2), \
+ (((M) & 1) ? 5 : 3) );})
+
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+ (__m256i)__builtin_shufflevector( \
+ (__v4di)(V1), \
+ (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+ (((M) & 1) ? 0 : 4), \
+ (((M) & 1) ? 1 : 5), \
+ (((M) & 1) ? 4 : 2), \
+ (((M) & 1) ? 5 : 3) );})
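A hedged usage sketch (the function name is illustrative), showing the lane selector written as a constant expression at the call site:

  static __m256 put_in_high_lane(__m256 v, __m128 part)
  {
    /* M = 1 selects the upper 128-bit lane; a runtime-variable selector
       would be rejected, which is exactly what the macro form enforces. */
    return _mm256_insertf128_ps(v, part, 1);
  }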
+
+/*
+ Vector extract.
+ We use macros rather than inlines because we only want to accept
+ invocations where the immediate M is a constant expression.
+*/
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+ (__m128)__builtin_shufflevector( \
+ (__v8sf)(V), \
+ (__v8sf)(_mm256_setzero_ps()), \
+ (((M) & 1) ? 4 : 0), \
+ (((M) & 1) ? 5 : 1), \
+ (((M) & 1) ? 6 : 2), \
+ (((M) & 1) ? 7 : 3) );})
+
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+ (__m128d)__builtin_shufflevector( \
+ (__v4df)(V), \
+ (__v4df)(_mm256_setzero_pd()), \
+ (((M) & 1) ? 2 : 0), \
+ (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+ (__m128i)__builtin_shufflevector( \
+ (__v4di)(V), \
+ (__v4di)(_mm256_setzero_si256()), \
+ (((M) & 1) ? 2 : 0), \
+ (((M) & 1) ? 3 : 1) );})
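Likewise for extraction, a minimal sketch (name illustrative; assumes <immintrin.h> is included so that the SSE2 _mm_add_pd is available) that sums the two 128-bit lanes of a double vector:

  static __m128d sum_128bit_lanes(__m256d v)
  {
    return _mm_add_pd(_mm256_extractf128_pd(v, 0),   /* low lane  */
                      _mm256_extractf128_pd(v, 1));  /* high lane */
  }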
+
/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
@@ -1195,7 +1229,7 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
{
struct __loadu_si128 {
__m128i __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
__m256i __v256 = _mm256_castsi128_si256(
((struct __loadu_si128*)__addr_lo)->__v);
return _mm256_insertf128_si256(__v256,
@@ -1236,4 +1270,34 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
__builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
}
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+ return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set_m128d (__m128d __hi, __m128d __lo) {
+ return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_m128i (__m128i __hi, __m128i __lo) {
+ return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+ return _mm256_set_m128(__hi, __lo);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
+ return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
+ return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
#endif /* __AVXINTRIN_H */
diff --git a/lib/Headers/cuda_builtin_vars.h b/lib/Headers/cuda_builtin_vars.h
new file mode 100644
index 000000000000..901356b3d5ce
--- /dev/null
+++ b/lib/Headers/cuda_builtin_vars.h
@@ -0,0 +1,110 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// This file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields are converted into calls to a
+// getter function, which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+// int x = threadIdx.x;
+// IR output:
+// %0 = call i32 @llvm.ptx.read.tid.x() #3
+// PTX output:
+// mov.u32 %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC) \
+ __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD; \
+ static inline __attribute__((always_inline)) \
+ __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) { \
+ return INTRINSIC; \
+ }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special variable types. nvcc
+// also disallows taking address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \
+ __attribute__((device)) TypeName() __DELETE; \
+ __attribute__((device)) TypeName(const TypeName &) __DELETE; \
+ __attribute__((device)) void operator=(const TypeName &) const __DELETE; \
+ __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+ __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
+ __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
+ __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
+private:
+ __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+ __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
+ __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
+ __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
+private:
+ __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+ __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
+ __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
+ __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
+private:
+ __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+ __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
+ __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
+ __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
+private:
+ __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR \
+ extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to a read of %WARP_SZ, but there's currently no
+// builtin to do so. According to PTX v4.2 docs 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
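For illustration, a minimal device-side sketch follows; the kernel name and parameters are assumptions, and __attribute__((global)) is used directly as the kernel annotation (the attribute behind CUDA's __global__ keyword). Each read such as threadIdx.x goes through the property getter defined above and lowers to the corresponding PTX special-register read.

  __attribute__((global)) void scale_kernel(float *out, const float *in,
                                            float k, unsigned int n) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)           /* guard against the last, partially filled block */
      out[i] = k * in[i];
  }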
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 28d004309cf4..c764d68a185d 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -489,7 +489,7 @@ _mm_loadu_pd(double const *__dp)
{
struct __loadu_pd {
__m128d __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__dp)->__v;
}
@@ -825,11 +825,28 @@ _mm_xor_si128(__m128i __a, __m128i __b)
return __a ^ __b;
}
-#define _mm_slli_si128(a, count) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128i __a = (a); \
- _Pragma("clang diagnostic pop"); \
- (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
+#define _mm_slli_si128(a, imm) __extension__ ({ \
+ (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \
+ (__v16qi)(__m128i)(a), \
+ ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
+ ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
+
+#define _mm_bslli_si128(a, imm) \
+ _mm_slli_si128((a), (imm))
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a, int __count)
@@ -891,12 +908,28 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
}
-
-#define _mm_srli_si128(a, count) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128i __a = (a); \
- _Pragma("clang diagnostic pop"); \
- (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
+#define _mm_srli_si128(a, imm) __extension__ ({ \
+ (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \
+ (__v16qi)_mm_setzero_si128(), \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
+ ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+ _mm_srli_si128((a), (imm))
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
@@ -1070,7 +1103,7 @@ _mm_loadu_si128(__m128i const *__p)
{
struct __loadu_si128 {
__m128i __v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_si128*)__p)->__v;
}
@@ -1284,27 +1317,21 @@ _mm_movemask_epi8(__m128i __a)
}
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128i __a = (a); \
- _Pragma("clang diagnostic pop"); \
- (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
+ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+ (__v4si)_mm_set1_epi32(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128i __a = (a); \
- _Pragma("clang diagnostic pop"); \
- (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+ (__v8hi)_mm_set1_epi16(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
4, 5, 6, 7); })
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128i __a = (a); \
- _Pragma("clang diagnostic pop"); \
- (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+ (__v8hi)_mm_set1_epi16(0), \
0, 1, 2, 3, \
4 + (((imm) & 0x03) >> 0), \
4 + (((imm) & 0x0c) >> 2), \
@@ -1396,11 +1423,8 @@ _mm_movemask_pd(__m128d __a)
}
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
- _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
- __m128d __a = (a); \
- __m128d __b = (b); \
- _Pragma("clang diagnostic pop"); \
- __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })
+ __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
+ (i) & 1, (((i) & 2) >> 1) + 2); })
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)
diff --git a/lib/Headers/htmintrin.h b/lib/Headers/htmintrin.h
new file mode 100644
index 000000000000..0088c7ccab93
--- /dev/null
+++ b/lib/Headers/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED 0x1
+#define _HTM_TRANSACTIONAL 0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+ (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+ (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+ _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+ _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin */
+#define _HTM_TBEGIN_STARTED 0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT 2
+#define _HTM_TBEGIN_PERSISTENT 3
+
+/* The abort codes below this threshold are reserved for machine use. */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation chapter 5-91. */
+
+struct __htm_tdb {
+ unsigned char format; /* 0 */
+ unsigned char flags;
+ unsigned char reserved1[4];
+ unsigned short nesting_depth;
+ unsigned long long abort_code; /* 8 */
+ unsigned long long conflict_token; /* 16 */
+ unsigned long long atia; /* 24 */
+ unsigned char eaid; /* 32 */
+ unsigned char dxc;
+ unsigned char reserved2[2];
+ unsigned int program_int_id;
+ unsigned long long exception_id; /* 40 */
+ unsigned long long bea; /* 48 */
+ unsigned char reserved3[72]; /* 56 */
+ unsigned long long gprs[16]; /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure. */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int retry)
+{
+ int cc, i = 0;
+
+ while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+ && i++ < retry)
+ __builtin_tx_assist(i);
+
+ return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *tdb, int retry)
+{
+ int cc, i = 0;
+
+ while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
+ && i++ < retry)
+ __builtin_tx_assist(i);
+
+ return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+ (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+ __builtin_tbegin_retry_null(retry) : \
+ __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int retry)
+{
+ int cc, i = 0;
+
+ while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+ && i++ < retry)
+ __builtin_tx_assist(i);
+
+ return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
+{
+ int cc, i = 0;
+
+ while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
+ && i++ < retry)
+ __builtin_tx_assist(i);
+
+ return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+ (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+ __builtin_tbegin_retry_nofloat_null(retry) : \
+ __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
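A hedged usage sketch of the retry helpers above (function and variable names are illustrative): start the transaction with a bounded number of retries, commit with __builtin_tend() on success, and otherwise let the caller fall back to a lock.

  static long add_transactionally(long *counter)
  {
    /* Up to 5 retries on transient failure; the literal 0 for tdb selects
       the no-diagnostic-block variant of the helper. */
    int cc = __builtin_tbegin_retry(0, 5);
    if (cc == _HTM_TBEGIN_STARTED) {
      ++*counter;        /* transactional body */
      __builtin_tend();
      return 1;          /* committed */
    }
    return 0;            /* persistent failure or retries exhausted */
  }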
diff --git a/lib/Headers/htmxlintrin.h b/lib/Headers/htmxlintrin.h
new file mode 100644
index 000000000000..30f524d5df49
--- /dev/null
+++ b/lib/Headers/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+ ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+ ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+ ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+ ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+ started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+ below. */
+#define _HTM_TBEGIN_STARTED 1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+ if (__builtin_expect (__builtin_tbegin (0), 1))
+ return _HTM_TBEGIN_STARTED;
+ return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const TM_buff)
+{
+ *_TEXASRL_PTR (TM_buff) = 0;
+ if (__builtin_expect (__builtin_tbegin (0), 1))
+ return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+ *_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
+#else
+ *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
+ *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
+#endif
+ *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
+ return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+ if (__builtin_expect (__builtin_tend (0), 1))
+ return 1;
+ return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+ __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const code)
+{
+ __builtin_tabort (code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+ __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+ __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+
+ *code = _TEXASRU_FAILURE_CODE (texasru);
+ return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const TM_buff)
+{
+ texasrl_t texasrl;
+
+ if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+ {
+ texasrl = *_TEXASRL_PTR (TM_buff);
+ if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+ texasrl = 0;
+ }
+ else
+ texasrl = (texasrl_t) __builtin_get_texasr ();
+
+ return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ /* Return TEXASR bits 11 (Self-Induced Conflict) through
+ 14 (Translation Invalidation Conflict). */
+ return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const TM_buff)
+{
+ texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const TM_buff)
+{
+ return *_TFIAR_PTR (TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const TM_buff)
+{
+ return *_TEXASR_PTR (TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+ the IBM XL compiler. For documentation please see the "z/OS XL
+   C/C++ Programming Guide", publicly available on the web. */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+ return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const tdb)
+{
+ return __builtin_tbegin_nofloat (tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+ return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+ return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const code)
+{
+ return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const addr, long long const value)
+{
+ __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const tdb_ptr)
+{
+ int depth = __builtin_tx_nesting_depth ();
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ if (depth != 0)
+ return depth;
+
+ if (tdb->format != 1)
+ return 0;
+ return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ if (tdb->format != 1)
+ return 0;
+
+ return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ if (tdb->format != 1)
+ return 0;
+
+ if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+ {
+ *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+ return 1;
+ }
+ return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ return (tdb->format == 1
+ && (tdb->abort_code == 4 /* unfiltered program interruption */
+ || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ return (tdb->format == 1
+ && (tdb->abort_code == 7 /* fetch overflow */
+ || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ return (tdb->format == 1
+ && (tdb->abort_code == 9 /* fetch conflict */
+ || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const result)
+{
+ return result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+ return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const tdb_ptr)
+{
+ struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+ return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H */
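A hedged usage sketch of the XL-compatible interface (names are illustrative): begin a transaction, commit on success, and report failure to the caller. The failure-diagnostic helpers are omitted here because they take a TM_buff on PowerPC but the tbegin result on System z; the sketch sticks to the common subset.

  static long increment_once(long *p)
  {
    if (__TM_simple_begin() == _HTM_TBEGIN_STARTED) {
      ++*p;              /* transactional body */
      __TM_end();
      return 1;          /* committed */
    }
    return 0;            /* failed to start: caller retries or falls back */
  }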
diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h
index 2400fea499bd..ac7d54a41db8 100644
--- a/lib/Headers/immintrin.h
+++ b/lib/Headers/immintrin.h
@@ -88,10 +88,18 @@
#include <avx512bwintrin.h>
#endif
+#ifdef __AVX512DQ__
+#include <avx512dqintrin.h>
+#endif
+
#if defined (__AVX512VL__) && defined (__AVX512BW__)
#include <avx512vlbwintrin.h>
#endif
+#if defined (__AVX512VL__) && defined (__AVX512DQ__)
+#include <avx512vldqintrin.h>
+#endif
+
#ifdef __AVX512ER__
#include <avx512erintrin.h>
#endif
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 062464ed2e53..ac5876f8d726 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -49,7 +49,7 @@ module _Builtin_intrinsics [system] [extern_c] {
explicit module sse {
requires sse
export mmx
- export * // note: for hackish <emmintrin.h> dependency
+ export sse2 // note: for hackish <emmintrin.h> dependency
header "xmmintrin.h"
}
@@ -169,6 +169,19 @@ module _Builtin_intrinsics [system] [extern_c] {
header "__wmmintrin_pclmul.h"
}
}
+
+ explicit module systemz {
+ requires systemz
+ export *
+
+ header "s390intrin.h"
+
+ explicit module htm {
+ requires htm
+ header "htmintrin.h"
+ header "htmxlintrin.h"
+ }
+ }
}
module _Builtin_stddef_max_align_t [system] [extern_c] {
diff --git a/lib/Headers/s390intrin.h b/lib/Headers/s390intrin.h
new file mode 100644
index 000000000000..b20989552d4d
--- /dev/null
+++ b/lib/Headers/s390intrin.h
@@ -0,0 +1,35 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/lib/Headers/stdatomic.h b/lib/Headers/stdatomic.h
index e3c34767a21a..e03798766014 100644
--- a/lib/Headers/stdatomic.h
+++ b/lib/Headers/stdatomic.h
@@ -71,7 +71,7 @@ typedef enum memory_order {
/* 7.17.4 Fences */
-// These should be provided by the libc implementation.
+/* These should be provided by the libc implementation. */
void atomic_thread_fence(memory_order);
void atomic_signal_fence(memory_order);
@@ -164,7 +164,7 @@ typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
#define ATOMIC_FLAG_INIT { 0 }
-// These should be provided by the libc implementation.
+/* These should be provided by the libc implementation. */
#ifdef __cplusplus
bool atomic_flag_test_and_set(volatile atomic_flag *);
bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
diff --git a/lib/Headers/unwind.h b/lib/Headers/unwind.h
index 90aca16aca39..303d79288aee 100644
--- a/lib/Headers/unwind.h
+++ b/lib/Headers/unwind.h
@@ -235,9 +235,9 @@ void *_Unwind_FindEnclosingFunction(void *);
#ifdef __APPLE__
_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
- __attribute__((unavailable));
+ __attribute__((__unavailable__));
_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
- __attribute__((unavailable));
+ __attribute__((__unavailable__));
/* Darwin-specific functions */
void __register_frame(const void *);
@@ -251,15 +251,15 @@ struct dwarf_eh_bases {
void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
void __register_frame_info_bases(const void *, void *, void *, void *)
- __attribute__((unavailable));
-void __register_frame_info(const void *, void *) __attribute__((unavailable));
+ __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
void __register_frame_info_table_bases(const void *, void*, void *, void *)
- __attribute__((unavailable));
+ __attribute__((__unavailable__));
void __register_frame_info_table(const void *, void *)
- __attribute__((unavailable));
-void __register_frame_table(const void *) __attribute__((unavailable));
-void __deregister_frame_info(const void *) __attribute__((unavailable));
-void __deregister_frame_info_bases(const void *)__attribute__((unavailable));
+ __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
#else
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index d1afe81601c3..3a6b95e8bfea 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -994,7 +994,7 @@ do { \
#define _m_ _mm_
/* Ugly hack for backwards-compatibility (compatible with gcc) */
-#ifdef __SSE2__
+#if defined(__SSE2__) && !__has_feature(modules)
#include <emmintrin.h>
#endif