aboutsummaryrefslogtreecommitdiff
path: root/lib/Headers/avx512fintrin.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Headers/avx512fintrin.h')
-rw-r--r--lib/Headers/avx512fintrin.h2346
1 files changed, 1524 insertions, 822 deletions
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 0bf6582345d4..e6a7217c8967 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -54,6 +54,19 @@ typedef unsigned short __mmask16;
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
+/* Constants for integer comparison predicates */
+typedef enum {
+ _MM_CMPINT_EQ, /* Equal */
+ _MM_CMPINT_LT, /* Less than */
+ _MM_CMPINT_LE, /* Less than or Equal */
+ _MM_CMPINT_UNUSED,
+ _MM_CMPINT_NE, /* Not Equal */
+ _MM_CMPINT_NLT, /* Not Less than */
+#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
+ _MM_CMPINT_NLE /* Not Less than or Equal */
+#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
+} _MM_CMPINT_ENUM;
+
typedef enum
{
_MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
@@ -503,6 +516,18 @@ _mm512_castsi512_si256 (__m512i __A)
return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+ return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+ return (int)__a;
+}
+
/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_and_epi32(__m512i __a, __m512i __b)
@@ -737,22 +762,19 @@ _mm512_add_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -762,22 +784,19 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -787,22 +806,19 @@ _mm512_add_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -812,22 +828,19 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
@@ -1403,57 +1416,45 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline __m512i __DEFAULT_FN_ATTRS
@@ -1463,21 +1464,19 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B)
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W, __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)__W);
}
#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
@@ -1977,38 +1976,30 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
@@ -2120,40 +2111,30 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
@@ -2265,40 +2246,30 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
@@ -2417,21 +2388,16 @@ _mm512_div_pd(__m512d __a, __m512d __b)
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline __m512 __DEFAULT_FN_ATTRS
@@ -2442,21 +2408,16 @@ _mm512_div_ps(__m512 __a, __m512 __b)
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
@@ -3443,71 +3404,94 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
}
#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(A), \
+ ((int)(I) & 0x7) + 0, \
+ ((int)(I) & 0x7) + 1, \
+ ((int)(I) & 0x7) + 2, \
+ ((int)(I) & 0x7) + 3, \
+ ((int)(I) & 0x7) + 4, \
+ ((int)(I) & 0x7) + 5, \
+ ((int)(I) & 0x7) + 6, \
+ ((int)(I) & 0x7) + 7); })
#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(A), \
+ ((int)(I) & 0xf) + 0, \
+ ((int)(I) & 0xf) + 1, \
+ ((int)(I) & 0xf) + 2, \
+ ((int)(I) & 0xf) + 3, \
+ ((int)(I) & 0xf) + 4, \
+ ((int)(I) & 0xf) + 5, \
+ ((int)(I) & 0xf) + 6, \
+ ((int)(I) & 0xf) + 7, \
+ ((int)(I) & 0xf) + 8, \
+ ((int)(I) & 0xf) + 9, \
+ ((int)(I) & 0xf) + 10, \
+ ((int)(I) & 0xf) + 11, \
+ ((int)(I) & 0xf) + 12, \
+ ((int)(I) & 0xf) + 13, \
+ ((int)(I) & 0xf) + 14, \
+ ((int)(I) & 0xf) + 15); })
#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
/* Vector Extract */
-#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
- (__v4df)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
+ (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ ((I) & 1) ? 4 : 0, \
+ ((I) & 1) ? 5 : 1, \
+ ((I) & 1) ? 6 : 2, \
+ ((I) & 1) ? 7 : 3); })
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)(W)); })
#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)_mm256_setzero_pd()); })
-#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1); })
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
+ (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ 0 + ((I) & 0x3) * 4, \
+ 1 + ((I) & 0x3) * 4, \
+ 2 + ((I) & 0x3) * 4, \
+ 3 + ((I) & 0x3) * 4); })
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)(W)); })
#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)_mm_setzero_ps()); })
+
/* Vector Blend */
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -3556,10 +3540,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_ps_mask(A, B, P) \
_mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
_mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
(__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(P), \
@@ -3572,10 +3595,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_pd_mask(A, B, P) \
_mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
_mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
/* Conversion */
#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
@@ -3682,26 +3744,35 @@ _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepi32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -3734,26 +3805,35 @@ _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepu32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
@@ -3798,6 +3878,24 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_pslo (__m512d __A)
+{
+ return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
+{
+ return (__m512) __builtin_shufflevector (
+ (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+ __U, __A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)_mm256_undefined_si256(), \
@@ -4919,263 +5017,227 @@ _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi32 (__m128i __A)
+_mm512_cvtepi8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi64 (__m128i __A)
+_mm512_cvtepi8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi32_epi64 (__m256i __X)
+_mm512_cvtepi32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi32 (__m256i __A)
+_mm512_cvtepi16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi64 (__m128i __A)
+_mm512_cvtepi16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi32 (__m128i __A)
+_mm512_cvtepu8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi64 (__m128i __A)
+_mm512_cvtepu8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu32_epi64 (__m256i __X)
+_mm512_cvtepu32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi32 (__m256i __A)
+_mm512_cvtepu16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi64 (__m128i __A)
+_mm512_cvtepu16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -5393,67 +5455,91 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U)); })
-#define _mm512_slli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
-
-#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
-
-#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
-#define _mm512_slli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)__W);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)__W);
+}
-#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
@@ -5911,8 +5997,10 @@ _mm512_kmov (__mmask16 __A)
(int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
(int)(P), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
@@ -5926,351 +6014,267 @@ _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi32 (__m512i __A, __m128i __B)
+_mm512_sll_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi64 (__m512i __A, __m128i __B)
+_mm512_sll_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi32 (__m512i __A, __m128i __B)
+_mm512_sra_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi64 (__m512i __A, __m128i __B)
+_mm512_sra_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi32 (__m512i __A, __m128i __B)
+_mm512_srl_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi64 (__m512i __A, __m128i __B)
+_mm512_srl_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
@@ -6309,8 +6313,10 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)(U)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6328,6 +6334,7 @@ _mm_cvtsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6339,6 +6346,7 @@ _mm_cvtsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6346,11 +6354,13 @@ _mm_cvtsd_u64 (__m128d __A)
#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6362,6 +6372,7 @@ _mm_cvtss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6373,6 +6384,7 @@ _mm_cvtss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6387,6 +6399,7 @@ _mm_cvttsd_i32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
@@ -6399,6 +6412,7 @@ _mm_cvttsd_i64 (__m128d __A)
return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6410,6 +6424,7 @@ _mm_cvttsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6421,6 +6436,7 @@ _mm_cvttsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6435,6 +6451,7 @@ _mm_cvttss_i32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
@@ -6447,6 +6464,7 @@ _mm_cvttss_i64 (__m128 __A)
return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6458,6 +6476,7 @@ _mm_cvttss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6469,6 +6488,7 @@ _mm_cvttss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
@@ -6556,61 +6576,47 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
(__v16sf)_mm512_setzero_ps()); })
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_permutevar_pd (__m512d __A, __m512i __C)
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_permutevar_ps (__m512 __A, __m512i __C)
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf) __W,
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -7028,35 +7034,48 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION); })
-#define _mm512_srai_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)__W);
+}
-#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srai_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)__W);
+}
-#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)_mm512_setzero_si512());
+}
#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
@@ -7832,107 +7851,145 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
__builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}
-#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1); })
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
+ (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ 0 + ((imm) & 0x3) * 4, \
+ 1 + ((imm) & 0x3) * 4, \
+ 2 + ((imm) & 0x3) * 4, \
+ 3 + ((imm) & 0x3) * 4); })
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+ (__v4si)__W); })
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
-#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_undefined_si256(), \
- (__mmask8)-1); })
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
+ (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+ (__v4di)__W); })
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1); })
+ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)(W)); })
#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()); })
#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)(W)); })
#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1); })
+ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)(W)); })
#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()); })
#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)(W)); })
#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
@@ -8275,17 +8332,17 @@ __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8323,17 +8380,17 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8355,33 +8412,33 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8419,17 +8476,17 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8451,33 +8508,33 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+ return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8515,17 +8572,17 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8547,33 +8604,33 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
(__v2df) __X,
- -(__v2df) __Y,
+ (__v2df) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8611,17 +8668,17 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8644,17 +8701,17 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W),
+ return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
(__v2df) __X,
- -(__v2df) (__Y),
+ (__v2df) (__Y),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
#define _mm512_permutex_pd(X, C) __extension__ ({ \
@@ -9041,6 +9098,101 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
(__v16sf)_mm512_setzero_ps());
}
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess128_mask ((__v16sf *)__W,
+ (__v16sf) _mm512_castps128_ps512(__A),
+ (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd128_mask ((__v8df *)__W,
+ (__v8df) _mm512_castpd128_pd512(__A),
+ (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+ __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+ (__v4sf) {0.0, 0.0, 0.0, 0.0},
+ 0, 4, 4, 4);
+
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_castps128_ps512(src),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_setzero_ps(),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+ __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+ (__v2df) {0.0, 0.0}, 0, 2);
+
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_castpd128_pd512(src),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_setzero_pd(),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
+
#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
(__v16si)_mm512_undefined_epi32(), \
@@ -9243,6 +9395,18 @@ _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpslo_pd (__m512 __A)
+{
+ return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+ return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
@@ -9340,14 +9504,17 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
}
#define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
@@ -9355,6 +9522,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
@@ -9362,6 +9530,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
@@ -9369,6 +9538,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
@@ -9412,6 +9582,7 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B)
return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9422,6 +9593,7 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
@@ -9434,6 +9606,7 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9444,6 +9617,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
@@ -9452,12 +9626,14 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
__M);
}
+#ifdef __x86_64__
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
__M);
}
+#endif
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
@@ -9514,27 +9690,553 @@ _mm512_set_ps (float __A, float __B, float __C, float __D,
(e4),(e3),(e2),(e1),(e0))
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_abs_ps(__m512 A)
+_mm512_abs_ps(__m512 __A)
{
- return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A)
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
- return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_abs_pd(__m512d A)
+_mm512_abs_pd(__m512d __A)
{
- return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ;
+ return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A)
-{
- return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A);
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+ return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
+// outputs. This class of vector operation forms the basis of many scientific
+// computations. In vector-reduction arithmetic, the evaluation off is
+// independent of the order of the input elements of V.
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2 - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+
+#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
+ __extension__({ \
+ __m256##T1 Vec256 = __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7); \
+ __m128##T1 Vec128 = __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 0, 1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 2, 3); \
+ Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 0, -1) \
+ Operator \
+ __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 1, -1); \
+ return Vec128[0]; \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, +, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, *, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, &, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, |, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, +, f, d);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, *, f, d);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2 - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for packed double-precision.
+// T3 - Can be Pd for packed double or q for q-word.
+
+#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = __builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ &, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
+ i, i, q);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
+ f, d, pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
+ f, d, pd);
+}
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2 - Can get 'i' for int and ' ' for packed single.
+// T1 - Can get 'i' for int and 'f' for float.
+
+#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
+ __m256##T1 Vec256 = \
+ (__m256##T1)(__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15)); \
+ __m128##T1 Vec128 = \
+ (__m128##T1)(__builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 4, 5, 6, 7)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, 1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 2, 3, -1, -1)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, -1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 1, -1, -1, -1)); \
+ return Vec128[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_add_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, +, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, *, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_and_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, &, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_or_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, |, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, +, f, );
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, *, f, );
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2 - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+// T3 - Can be Ps for packed single or d for d-word.
+
+#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
+ i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+}
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector with size of 512.
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+// __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, \
+ -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1))\
+ ; \
+ return Vec512[0]; \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epi64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_max_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
+(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_min_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x8000000000000000}
+// {max_epu,0x0000000000000000}
+// {max_pd, 0xFFF0000000000000}
+// {min_epi,0x7FFFFFFFFFFFFFFF}
+// {min_epu,0xFFFFFFFFFFFFFFFF}
+// {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+// __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+// [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+ max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+ max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
+ max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+ min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+ min_pd, d, f, pd, __M);
+}
+
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+// __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ return Vec512[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
+ _mm512_reduce_maxMin_32bit(a, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
+ _mm512_reduce_maxMin_32bit(a, min_ps, , f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x80000000}
+// {max_epu,0x00000000}
+// {max_ps, 0xFF800000}
+// {min_epi,0x7FFFFFFF}
+// {min_epu,0xFFFFFFFF}
+// {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+// __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+// [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f,
+ ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
+ ps, __M);
}
#undef __DEFAULT_FN_ATTRS