diff options
Diffstat (limited to 'lib/Headers/avx512fintrin.h')
-rw-r--r-- | lib/Headers/avx512fintrin.h | 2346 |
1 files changed, 1524 insertions, 822 deletions
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h index 0bf6582345d4..e6a7217c8967 100644 --- a/lib/Headers/avx512fintrin.h +++ b/lib/Headers/avx512fintrin.h @@ -54,6 +54,19 @@ typedef unsigned short __mmask16; #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 +/* Constants for integer comparison predicates */ +typedef enum { + _MM_CMPINT_EQ, /* Equal */ + _MM_CMPINT_LT, /* Less than */ + _MM_CMPINT_LE, /* Less than or Equal */ + _MM_CMPINT_UNUSED, + _MM_CMPINT_NE, /* Not Equal */ + _MM_CMPINT_NLT, /* Not Less than */ +#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ + _MM_CMPINT_NLE /* Not Less than or Equal */ +#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ +} _MM_CMPINT_ENUM; + typedef enum { _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, @@ -503,6 +516,18 @@ _mm512_castsi512_si256 (__m512i __A) return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_int2mask(int __a) +{ + return (__mmask16)__a; +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask2int(__mmask16 __a) +{ + return (int)__a; +} + /* Bitwise operators */ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_and_epi32(__m512i __a, __m512i __b) @@ -737,22 +762,19 @@ _mm512_add_epi64 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -762,22 +784,19 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -787,22 +806,19 @@ _mm512_add_epi32 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -812,22 +828,19 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } #define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \ @@ -1403,57 +1416,45 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) static __inline __m512i __DEFAULT_FN_ATTRS _mm512_mul_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)__W); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); } static __inline __m512i __DEFAULT_FN_ATTRS _mm512_mul_epu32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)__W); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); } static __inline __m512i __DEFAULT_FN_ATTRS @@ -1463,21 +1464,19 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B) } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)__W); } #define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \ @@ -1977,38 +1976,30 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_add_round_pd(A, B, R) __extension__ ({ \ @@ -2120,40 +2111,30 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_sub_round_pd(A, B, R) __extension__ ({ \ @@ -2265,40 +2246,30 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_mul_round_pd(A, B, R) __extension__ ({ \ @@ -2417,21 +2388,16 @@ _mm512_div_pd(__m512d __a, __m512d __b) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline __m512 __DEFAULT_FN_ATTRS @@ -2442,21 +2408,16 @@ _mm512_div_ps(__m512 __a, __m512 __b) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_div_round_pd(A, B, R) __extension__ ({ \ @@ -3443,71 +3404,94 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, } #define _mm512_alignr_epi64(A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(A), \ + ((int)(I) & 0x7) + 0, \ + ((int)(I) & 0x7) + 1, \ + ((int)(I) & 0x7) + 2, \ + ((int)(I) & 0x7) + 3, \ + ((int)(I) & 0x7) + 4, \ + ((int)(I) & 0x7) + 5, \ + ((int)(I) & 0x7) + 6, \ + ((int)(I) & 0x7) + 7); }) #define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)(__m512i)(W)); }) #define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_alignr_epi32(A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(A), \ + ((int)(I) & 0xf) + 0, \ + ((int)(I) & 0xf) + 1, \ + ((int)(I) & 0xf) + 2, \ + ((int)(I) & 0xf) + 3, \ + ((int)(I) & 0xf) + 4, \ + ((int)(I) & 0xf) + 5, \ + ((int)(I) & 0xf) + 6, \ + ((int)(I) & 0xf) + 7, \ + ((int)(I) & 0xf) + 8, \ + ((int)(I) & 0xf) + 9, \ + ((int)(I) & 0xf) + 10, \ + ((int)(I) & 0xf) + 11, \ + ((int)(I) & 0xf) + 12, \ + ((int)(I) & 0xf) + 13, \ + ((int)(I) & 0xf) + 14, \ + ((int)(I) & 0xf) + 15); }) #define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)(__m512i)(W)); }) #define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) /* Vector Extract */ -#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_setzero_si256(), \ - (__mmask8)-1); }) +#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ + (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + ((I) & 1) ? 4 : 0, \ + ((I) & 1) ? 5 : 1, \ + ((I) & 1) ? 6 : 2, \ + ((I) & 1) ? 7 : 3); }) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ + (__v4df)(W)); }) #define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ + (__v4df)_mm256_setzero_pd()); }) -#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) +#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ + (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + 0 + ((I) & 0x3) * 4, \ + 1 + ((I) & 0x3) * 4, \ + 2 + ((I) & 0x3) * 4, \ + 3 + ((I) & 0x3) * 4); }) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ + (__v4sf)(W)); }) #define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ + (__v4sf)_mm_setzero_ps()); }) + /* Vector Blend */ static __inline __m512d __DEFAULT_FN_ATTRS @@ -3556,10 +3540,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) #define _mm512_cmp_ps_mask(A, B, P) \ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) - #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_cmpeq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) +#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) + #define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(P), \ @@ -3572,10 +3595,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) #define _mm512_cmp_pd_mask(A, B, P) \ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) - #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_cmpeq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) +#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) + /* Conversion */ #define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \ @@ -3682,26 +3744,35 @@ _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) static __inline __m512d __DEFAULT_FN_ATTRS _mm512_cvtepi32_pd(__m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_cvtepi32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); } static __inline__ __m512 __DEFAULT_FN_ATTRS @@ -3734,26 +3805,35 @@ _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) static __inline __m512d __DEFAULT_FN_ATTRS _mm512_cvtepu32_pd(__m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_cvtepu32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); } #define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \ @@ -3798,6 +3878,24 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_cvtpd_pslo (__m512d __A) +{ + return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) +{ + return (__m512) __builtin_shufflevector ( + (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), + __U, __A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + #define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_undefined_si256(), \ @@ -4919,263 +5017,227 @@ _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi8_epi32 (__m128i __A) +_mm512_cvtepi8_epi32(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A) +_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi8_epi64 (__m128i __A) +_mm512_cvtepi8_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi32_epi64 (__m256i __X) +_mm512_cvtepi32_epi64(__m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X) +_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi16_epi32 (__m256i __A) +_mm512_cvtepi16_epi32(__m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A) +_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi16_epi64 (__m128i __A) +_mm512_cvtepi16_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu8_epi32 (__m128i __A) +_mm512_cvtepu8_epi32(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A) +_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu8_epi64 (__m128i __A) +_mm512_cvtepu8_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu32_epi64 (__m256i __X) +_mm512_cvtepu32_epi64(__m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X) +_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu16_epi32 (__m256i __A) +_mm512_cvtepu16_epi32(__m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A) +_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu16_epi64 (__m128i __A) +_mm512_cvtepu16_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -5393,67 +5455,91 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U)); }) -#define _mm512_slli_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) - -#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) - -#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_slli_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); +} -#define _mm512_slli_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)__W); +} -#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_slli_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)__W); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} -#define _mm512_srli_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srli_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); +} -#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)__W); +} -#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_srli_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srli_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); +} -#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)__W); +} -#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) @@ -5911,8 +5997,10 @@ _mm512_kmov (__mmask16 __A) (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ (int)(P), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, @@ -5926,351 +6014,267 @@ _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sll_epi32 (__m512i __A, __m128i __B) +_mm512_sll_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sll_epi64 (__m512i __A, __m128i __B) +_mm512_sll_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sllv_epi32 (__m512i __X, __m512i __Y) +_mm512_sllv_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sllv_epi64 (__m512i __X, __m512i __Y) +_mm512_sllv_epi64(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sra_epi32 (__m512i __A, __m128i __B) +_mm512_sra_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sra_epi64 (__m512i __A, __m128i __B) +_mm512_sra_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srav_epi32 (__m512i __X, __m512i __Y) +_mm512_srav_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srav_epi64 (__m512i __X, __m512i __Y) +_mm512_srav_epi64(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srl_epi32 (__m512i __A, __m128i __B) +_mm512_srl_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srl_epi64 (__m512i __A, __m128i __B) +_mm512_srl_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srlv_epi32 (__m512i __X, __m512i __Y) +_mm512_srlv_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_srlv_epi64 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } #define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ @@ -6309,8 +6313,10 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)(__m512i)(C), (int)(imm), \ (__mmask8)(U)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#endif #define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6328,6 +6334,7 @@ _mm_cvtsd_u32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ (int)(R)); }) @@ -6339,6 +6346,7 @@ _mm_cvtsd_u64 (__m128d __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvt_roundss_si32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6346,11 +6354,13 @@ _mm_cvtsd_u64 (__m128d __A) #define _mm_cvt_roundss_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundss_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) #define _mm_cvt_roundss_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#endif #define _mm_cvt_roundss_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6362,6 +6372,7 @@ _mm_cvtss_u32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundss_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ (int)(R)); }) @@ -6373,6 +6384,7 @@ _mm_cvtss_u64 (__m128 __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6387,6 +6399,7 @@ _mm_cvttsd_i32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); }) @@ -6399,6 +6412,7 @@ _mm_cvttsd_i64 (__m128d __A) return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6410,6 +6424,7 @@ _mm_cvttsd_u32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ (int)(R)); }) @@ -6421,6 +6436,7 @@ _mm_cvttsd_u64 (__m128d __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6435,6 +6451,7 @@ _mm_cvttss_i32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); }) @@ -6447,6 +6464,7 @@ _mm_cvttss_i64 (__m128 __A) return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6458,6 +6476,7 @@ _mm_cvttss_u32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ (int)(R)); }) @@ -6469,6 +6488,7 @@ _mm_cvttss_u64 (__m128 __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, @@ -6556,61 +6576,47 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, (__v16sf)_mm512_setzero_ps()); }) static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_permutevar_pd (__m512d __A, __m512i __C) +_mm512_permutevar_pd(__m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_permutevar_ps (__m512 __A, __m512i __C) +_mm512_permutevar_ps(__m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) __W, - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)_mm512_setzero_ps()); } static __inline __m512d __DEFAULT_FN_ATTRS @@ -7028,35 +7034,48 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8)(U), \ _MM_FROUND_CUR_DIRECTION); }) -#define _mm512_srai_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srai_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); +} -#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ + (__v16si)_mm512_srai_epi32(__A, __B), \ + (__v16si)__W); +} -#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ + (__v16si)_mm512_srai_epi32(__A, __B), \ + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_srai_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srai_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); +} -#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ + (__v8di)_mm512_srai_epi64(__A, __B), \ + (__v8di)__W); +} -#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ + (__v8di)_mm512_srai_epi64(__A, __B), \ + (__v8di)_mm512_setzero_si512()); +} #define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \ (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \ @@ -7832,107 +7851,145 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1); }) +#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \ + (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + 0 + ((imm) & 0x3) * 4, \ + 1 + ((imm) & 0x3) * 4, \ + 2 + ((imm) & 0x3) * 4, \ + 3 + ((imm) & 0x3) * 4); }) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ + (__v4si)__W); }) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ + (__v4si)_mm_setzero_si128()); }) -#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - (__mmask8)-1); }) +#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \ + (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + ((imm) & 1) ? 4 : 0, \ + ((imm) & 1) ? 5 : 1, \ + ((imm) & 1) ? 6 : 2, \ + ((imm) & 1) ? 7 : 3); }) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ + (__v4di)__W); }) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) #define _mm512_insertf64x4(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1); }) + (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 11 : 7); }) #define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)(W)); }) #define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_inserti64x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_castsi256_si512((__m256i)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 11 : 7); }) #define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)(W)); }) #define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_insertf32x4(A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1); }) + (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_castps128_ps512((__m128)(B)),\ + (((imm) & 0x3) == 0) ? 16 : 0, \ + (((imm) & 0x3) == 0) ? 17 : 1, \ + (((imm) & 0x3) == 0) ? 18 : 2, \ + (((imm) & 0x3) == 0) ? 19 : 3, \ + (((imm) & 0x3) == 1) ? 16 : 4, \ + (((imm) & 0x3) == 1) ? 17 : 5, \ + (((imm) & 0x3) == 1) ? 18 : 6, \ + (((imm) & 0x3) == 1) ? 19 : 7, \ + (((imm) & 0x3) == 2) ? 16 : 8, \ + (((imm) & 0x3) == 2) ? 17 : 9, \ + (((imm) & 0x3) == 2) ? 18 : 10, \ + (((imm) & 0x3) == 2) ? 19 : 11, \ + (((imm) & 0x3) == 3) ? 16 : 12, \ + (((imm) & 0x3) == 3) ? 17 : 13, \ + (((imm) & 0x3) == 3) ? 18 : 14, \ + (((imm) & 0x3) == 3) ? 19 : 15); }) #define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)(W)); }) #define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_inserti32x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_castsi128_si512((__m128i)(B)),\ + (((imm) & 0x3) == 0) ? 16 : 0, \ + (((imm) & 0x3) == 0) ? 17 : 1, \ + (((imm) & 0x3) == 0) ? 18 : 2, \ + (((imm) & 0x3) == 0) ? 19 : 3, \ + (((imm) & 0x3) == 1) ? 16 : 4, \ + (((imm) & 0x3) == 1) ? 17 : 5, \ + (((imm) & 0x3) == 1) ? 18 : 6, \ + (((imm) & 0x3) == 1) ? 19 : 7, \ + (((imm) & 0x3) == 2) ? 16 : 8, \ + (((imm) & 0x3) == 2) ? 17 : 9, \ + (((imm) & 0x3) == 2) ? 18 : 10, \ + (((imm) & 0x3) == 2) ? 19 : 11, \ + (((imm) & 0x3) == 3) ? 16 : 12, \ + (((imm) & 0x3) == 3) ? 17 : 13, \ + (((imm) & 0x3) == 3) ? 18 : 14, \ + (((imm) & 0x3) == 3) ? 19 : 15); }) #define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)(W)); }) #define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) #define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ @@ -8275,17 +8332,17 @@ __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, (__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8323,17 +8380,17 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, -(__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8355,33 +8412,33 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, (__v4sf) __X, - -(__v4sf) __Y, + (__v4sf) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ - -(__v4sf)(__m128)(Y), (__mmask8)(U), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, (__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8419,17 +8476,17 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, -(__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8451,33 +8508,33 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W, + return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W, (__v4sf) __X, - -(__v4sf) __Y, + (__v4sf) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \ + (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ - -(__v4sf)(__m128)(Y), (__mmask8)(U), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + (__v2df) __A, (__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8515,17 +8572,17 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + (__v2df) __A, -(__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8547,33 +8604,33 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, (__v2df) __X, - -(__v2df) __Y, + (__v2df) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ - -(__v2df)(__m128d)(Y), \ + (__v2df)(__m128d)(Y), \ (__mmask8)(U), (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + -(__v2df) __A, (__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8611,17 +8668,17 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + -(__v2df) __A, -(__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8644,17 +8701,17 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W), + return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W), (__v2df) __X, - -(__v2df) (__Y), + (__v2df) (__Y), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \ + (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ - -(__v2df)(__m128d)(Y), \ + (__v2df)(__m128d)(Y), \ (__mmask8)(U), (int)(R)); }) #define _mm512_permutex_pd(X, C) __extension__ ({ \ @@ -9041,6 +9098,101 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps()); } +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + __m128 res = __A; + res[0] = (__U & 1) ? __B[0] : __W[0]; + return res; +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + __m128 res = __A; + res[0] = (__U & 1) ? __B[0] : 0; + return res; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + __m128d res = __A; + res[0] = (__U & 1) ? __B[0] : __W[0]; + return res; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + __m128d res = __A; + res[0] = (__U & 1) ? __B[0] : 0; + return res; +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storess128_mask ((__v16sf *)__W, + (__v16sf) _mm512_castps128_ps512(__A), + (__mmask16) __U & (__mmask16)1); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storesd128_mask ((__v8df *)__W, + (__v8df) _mm512_castpd128_pd512(__A), + (__mmask8) __U & 1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) +{ + __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, + (__v4sf) {0.0, 0.0, 0.0, 0.0}, + 0, 4, 4, 4); + + return (__m128) __builtin_shufflevector( + __builtin_ia32_loadss128_mask ((__v16sf *) __A, + (__v16sf) _mm512_castps128_ps512(src), + (__mmask16) __U & 1), + _mm512_undefined_ps(), 0, 1, 2, 3); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_maskz_load_ss (__mmask8 __U, const float* __A) +{ + return (__m128) __builtin_shufflevector( + __builtin_ia32_loadss128_mask ((__v16sf *) __A, + (__v16sf) _mm512_setzero_ps(), + (__mmask16) __U & 1), + _mm512_undefined_ps(), 0, 1, 2, 3); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) +{ + __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, + (__v2df) {0.0, 0.0}, 0, 2); + + return (__m128d) __builtin_shufflevector( + __builtin_ia32_loadsd128_mask ((__v8df *) __A, + (__v8df) _mm512_castpd128_pd512(src), + (__mmask8) __U & 1), + _mm512_undefined_pd(), 0, 1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_maskz_load_sd (__mmask8 __U, const double* __A) +{ + return (__m128d) __builtin_shufflevector( + __builtin_ia32_loadsd128_mask ((__v8df *) __A, + (__v8df) _mm512_setzero_pd(), + (__mmask8) __U & 1), + _mm512_undefined_pd(), 0, 1); +} + #define _mm512_shuffle_epi32(A, I) __extension__ ({ \ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ (__v16si)_mm512_undefined_epi32(), \ @@ -9243,6 +9395,18 @@ _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) _MM_FROUND_CUR_DIRECTION); } +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_cvtpslo_pd (__m512 __A) +{ + return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) +{ + return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); +} + static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) { @@ -9340,14 +9504,17 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) } #define _mm_cvtss_i32 _mm_cvtss_si32 -#define _mm_cvtss_i64 _mm_cvtss_si64 #define _mm_cvtsd_i32 _mm_cvtsd_si32 -#define _mm_cvtsd_i64 _mm_cvtsd_si64 #define _mm_cvti32_sd _mm_cvtsi32_sd -#define _mm_cvti64_sd _mm_cvtsi64_sd #define _mm_cvti32_ss _mm_cvtsi32_ss +#ifdef __x86_64__ +#define _mm_cvtss_i64 _mm_cvtss_si64 +#define _mm_cvtsd_i64 _mm_cvtsd_si64 +#define _mm_cvti64_sd _mm_cvtsi64_sd #define _mm_cvti64_ss _mm_cvtsi64_ss +#endif +#ifdef __x86_64__ #define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ (int)(R)); }) @@ -9355,6 +9522,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ (int)(R)); }) +#endif #define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) @@ -9362,6 +9530,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ (int)(R)); }) @@ -9369,6 +9538,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ (int)(R)); }) +#endif #define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ @@ -9412,6 +9582,7 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B) return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); } +#ifdef __x86_64__ #define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ (unsigned long long)(B), (int)(R)); }) @@ -9422,6 +9593,7 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B) return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ @@ -9434,6 +9606,7 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ (unsigned long long)(B), (int)(R)); }) @@ -9444,6 +9617,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) @@ -9452,12 +9626,14 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) __M); } +#ifdef __x86_64__ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, __M); } +#endif static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set_epi32 (int __A, int __B, int __C, int __D, @@ -9514,27 +9690,553 @@ _mm512_set_ps (float __A, float __B, float __C, float __D, (e4),(e3),(e2),(e1),(e0)) static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_abs_ps(__m512 A) +_mm512_abs_ps(__m512 __A) { - return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ; + return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A) +_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) { - return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ; + return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_abs_pd(__m512d A) +_mm512_abs_pd(__m512d __A) { - return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ; + return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A) -{ - return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A); +_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) +{ + return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); +} + +// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as +// outputs. This class of vector operation forms the basis of many scientific +// computations. In vector-reduction arithmetic, the evaluation off is +// independent of the order of the input elements of V. + +// Used bisection method. At each step, we partition the vector with previous +// step in half, and the operation is performed on its two halves. +// This takes log2(n) steps where n is the number of elements in the vector. + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&,| +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for double. + +#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \ + __extension__({ \ + __m256##T1 Vec256 = __builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, 2, 3) \ + Operator \ + __builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 4, 5, 6, 7); \ + __m128##T1 Vec128 = __builtin_shufflevector( \ + (__v4d##T2)Vec256, \ + (__v4d##T2)Vec256, \ + 0, 1) \ + Operator \ + __builtin_shufflevector( \ + (__v4d##T2)Vec256, \ + (__v4d##T2)Vec256, \ + 2, 3); \ + Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \ + (__v2d##T2)Vec128, 0, -1) \ + Operator \ + __builtin_shufflevector((__v2d##T2)Vec128, \ + (__v2d##T2)Vec128, 1, -1); \ + return Vec128[0]; \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, +, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, *, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, &, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, |, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { + _mm512_reduce_operator_64bit(__W, +, f, d); +} + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { + _mm512_reduce_operator_64bit(__W, *, f, d); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - All vector elements set to the identity element. +// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0} +// Operator - Can be one of following: +,*,&,| +// Mask - Intrinsic Mask +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for packed double-precision. +// T3 - Can be Pd for packed double or q for q-word. + +#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \ + Mask, T2, T1, T3) \ + __extension__({ \ + Vec512 = __builtin_ia32_select##T3##_512( \ + (__mmask8)Mask, \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512Neutral); \ + _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + &, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, + i, i, q); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, + f, d, pd); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M, + f, d, pd); +} + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&,| +// T2 - Can get 'i' for int and ' ' for packed single. +// T1 - Can get 'i' for int and 'f' for float. + +#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \ + __m256##T1 Vec256 = \ + (__m256##T1)(__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, 4, 5, 6, 7) \ + Operator \ + __builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 8, 9, 10, 11, 12, 13, 14, 15)); \ + __m128##T1 Vec128 = \ + (__m128##T1)(__builtin_shufflevector( \ + (__v8s##T2)Vec256, \ + (__v8s##T2)Vec256, \ + 0, 1, 2, 3) \ + Operator \ + __builtin_shufflevector( \ + (__v8s##T2)Vec256, \ + (__v8s##T2)Vec256, \ + 4, 5, 6, 7)); \ + Vec128 = (__m128##T1)(__builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 0, 1, -1, -1) \ + Operator \ + __builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 2, 3, -1, -1)); \ + Vec128 = (__m128##T1)(__builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 0, -1, -1, -1) \ + Operator \ + __builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 1, -1, -1, -1)); \ + return Vec128[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_add_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, +, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_mul_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, *, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_and_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, &, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_or_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, |, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_add_ps(__m512 __W) { + _mm512_reduce_operator_32bit(__W, +, f, ); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_mul_ps(__m512 __W) { + _mm512_reduce_operator_32bit(__W, *, f, ); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - All vector elements set to the identity element. +// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0} +// Operator - Can be one of following: +,*,&,| +// Mask - Intrinsic Mask +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for double. +// T3 - Can be Ps for packed single or d for d-word. + +#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \ + Mask, T2, T1, T3) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask16)Mask, \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512Neutral); \ + _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, + i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps); +} + +// Used bisection method. At each step, we partition the vector with previous +// step in half, and the operation is performed on its two halves. +// This takes log2(n) steps where n is the number of elements in the vector. +// This macro uses only intrinsics from the AVX512F feature. + +// Vec512 - Vector with size of 512. +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}] + +#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, \ + -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1))\ + ; \ + return Vec512[0]; \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_reduce_max_epi64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_reduce_max_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_max_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, max_pd, d, f); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64 +(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_min_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, min_pd, d, f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x8000000000000000} +// {max_epu,0x0000000000000000} +// {max_pd, 0xFFF0000000000000} +// {min_epi,0x7FFFFFFFFFFFFFFF} +// {min_epu,0xFFFFFFFFFFFFFFFF} +// {min_pd, 0x7FF0000000000000} +// +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}] +// T3 - Can get 'q' q word and 'pd' for packed double. +// [__builtin_ia32_select{q|pd}_512] +// Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask8)Mask, \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000), + max_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000), + max_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()), + max_pd, d, f, pd, __M); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + min_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + min_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()), + min_pd, d, f, pd, __M); +} + +// Vec512 - Vector with size 512. +// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] + +#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + return Vec512[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, max_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_max_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, max_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, max_ps, , f); +} + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, min_ps, , f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x80000000} +// {max_epu,0x00000000} +// {max_ps, 0xFF800000} +// {min_epi,0x7FFFFFFF} +// {min_epu,0xFFFFFFFF} +// {min_ps, 0x7F800000} +// +// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] +// T3 - Can get 'q' q word and 'pd' for packed double. +// [__builtin_ia32_select{q|pd}_512] +// Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask16)Mask, \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f, + ps, __M); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f, + ps, __M); } #undef __DEFAULT_FN_ATTRS |