diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:04:05 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:04:05 +0000 |
| commit | 676fbe8105eeb6ff4bb2ed261cb212fcfdbe7b63 (patch) | |
| tree | 02a1ac369cb734d0abfa5000dd86e5b7797e6a74 /lib/Headers | |
| parent | c7e70c433efc6953dc3888b9fbf9f3512d7da2b0 (diff) | |
Vendor import of clang trunk r351319 (just before the release_80 branchvendor/clang/clang-trunk-r351319
Diffstat (limited to 'lib/Headers')
| -rw-r--r-- | lib/Headers/CMakeLists.txt | 6 | ||||
| -rw-r--r-- | lib/Headers/__clang_cuda_runtime_wrapper.h | 8 | ||||
| -rw-r--r-- | lib/Headers/adxintrin.h | 4 | ||||
| -rw-r--r-- | lib/Headers/altivec.h | 123 | ||||
| -rw-r--r-- | lib/Headers/avx512bwintrin.h | 839 | ||||
| -rw-r--r-- | lib/Headers/avx512dqintrin.h | 302 | ||||
| -rw-r--r-- | lib/Headers/avx512fintrin.h | 177 | ||||
| -rw-r--r-- | lib/Headers/avx512pfintrin.h | 32 | ||||
| -rw-r--r-- | lib/Headers/avx512vbmi2intrin.h | 158 | ||||
| -rw-r--r-- | lib/Headers/avx512vbmiintrin.h | 26 | ||||
| -rw-r--r-- | lib/Headers/avx512vbmivlintrin.h | 56 | ||||
| -rw-r--r-- | lib/Headers/avx512vlbwintrin.h | 75 | ||||
| -rw-r--r-- | lib/Headers/avx512vlintrin.h | 349 | ||||
| -rw-r--r-- | lib/Headers/avx512vlvbmi2intrin.h | 312 | ||||
| -rw-r--r-- | lib/Headers/bmiintrin.h | 10 | ||||
| -rw-r--r-- | lib/Headers/cuda_wrappers/new | 6 | ||||
| -rw-r--r-- | lib/Headers/emmintrin.h | 107 | ||||
| -rw-r--r-- | lib/Headers/float.h | 12 | ||||
| -rw-r--r-- | lib/Headers/immintrin.h | 59 | ||||
| -rw-r--r-- | lib/Headers/intrin.h | 546 | ||||
| -rw-r--r-- | lib/Headers/lzcntintrin.h | 22 | ||||
| -rw-r--r-- | lib/Headers/opencl-c.h | 659 | ||||
| -rw-r--r-- | lib/Headers/unwind.h | 4 | ||||
| -rw-r--r-- | lib/Headers/vecintrin.h | 6 |
24 files changed, 2530 insertions, 1368 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt index 1930d8e225c7..e444c9c8706f 100644 --- a/lib/Headers/CMakeLists.txt +++ b/lib/Headers/CMakeLists.txt @@ -144,7 +144,7 @@ foreach( f ${files} ${cuda_wrapper_files} ) list(APPEND out_files ${dst}) endforeach( f ) -add_custom_command(OUTPUT ${output_dir}/arm_neon.h +add_custom_command(OUTPUT ${output_dir}/arm_neon.h DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h COMMENT "Copying clang's arm_neon.h...") @@ -156,7 +156,9 @@ add_custom_command(OUTPUT ${output_dir}/arm_fp16.h list(APPEND out_files ${output_dir}/arm_fp16.h) add_custom_target(clang-headers ALL DEPENDS ${out_files}) -set_target_properties(clang-headers PROPERTIES FOLDER "Misc") +set_target_properties(clang-headers PROPERTIES + FOLDER "Misc" + RUNTIME_OUTPUT_DIRECTORY "${output_dir}") install( FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h index 09705a273a47..f05c0454a883 100644 --- a/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -62,10 +62,15 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10000 #error "Unsupported CUDA version!" #endif +#pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") +#if CUDA_VERSION >= 10000 +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#endif + // Make largest subset of device functions available during host // compilation -- SM_35 for the time being. #ifndef __CUDA_ARCH__ @@ -419,6 +424,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { #pragma pop_macro("dim3") #pragma pop_macro("uint3") #pragma pop_macro("__USE_FAST_MATH__") +#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") #endif // __CUDA__ #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/lib/Headers/adxintrin.h b/lib/Headers/adxintrin.h index ee347284178e..d6c454db8512 100644 --- a/lib/Headers/adxintrin.h +++ b/lib/Headers/adxintrin.h @@ -53,7 +53,7 @@ static __inline unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y, unsigned int *__p) { - return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p); + return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); } #ifdef __x86_64__ @@ -61,7 +61,7 @@ static __inline unsigned char __DEFAULT_FN_ATTRS _addcarry_u64(unsigned char __cf, unsigned long long __x, unsigned long long __y, unsigned long long *__p) { - return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p); + return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); } #endif diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h index 90fd477d9b98..2dc6adb9009f 100644 --- a/lib/Headers/altivec.h +++ b/lib/Headers/altivec.h @@ -9492,49 +9492,51 @@ vec_splat_u32(signed char __a) { /* vec_sr */ -static __inline__ vector signed char __ATTRS_o_ai -vec_sr(vector signed char __a, vector unsigned char __b) { - vector unsigned char __res = (vector unsigned char)__a >> __b; - return (vector signed char)__res; -} - +// vec_sr does modulo arithmetic on __b first, so __b is allowed to be more +// than the length of __a. static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b) { - return __a >> __b; + return __a >> + (__b % (vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__)); } -static __inline__ vector signed short __ATTRS_o_ai -vec_sr(vector signed short __a, vector unsigned short __b) { - vector unsigned short __res = (vector unsigned short)__a >> __b; - return (vector signed short)__res; +static __inline__ vector signed char __ATTRS_o_ai +vec_sr(vector signed char __a, vector unsigned char __b) { + return (vector signed char)vec_sr((vector unsigned char)__a, __b); } static __inline__ vector unsigned short __ATTRS_o_ai vec_sr(vector unsigned short __a, vector unsigned short __b) { - return __a >> __b; + return __a >> + (__b % (vector unsigned short)(sizeof(unsigned short) * __CHAR_BIT__)); } -static __inline__ vector signed int __ATTRS_o_ai -vec_sr(vector signed int __a, vector unsigned int __b) { - vector unsigned int __res = (vector unsigned int)__a >> __b; - return (vector signed int)__res; +static __inline__ vector short __ATTRS_o_ai vec_sr(vector short __a, + vector unsigned short __b) { + return (vector short)vec_sr((vector unsigned short)__a, __b); } static __inline__ vector unsigned int __ATTRS_o_ai vec_sr(vector unsigned int __a, vector unsigned int __b) { - return __a >> __b; + return __a >> + (__b % (vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__)); } -#ifdef __POWER8_VECTOR__ -static __inline__ vector signed long long __ATTRS_o_ai -vec_sr(vector signed long long __a, vector unsigned long long __b) { - vector unsigned long long __res = (vector unsigned long long)__a >> __b; - return (vector signed long long)__res; +static __inline__ vector int __ATTRS_o_ai vec_sr(vector int __a, + vector unsigned int __b) { + return (vector int)vec_sr((vector unsigned int)__a, __b); } +#ifdef __POWER8_VECTOR__ static __inline__ vector unsigned long long __ATTRS_o_ai vec_sr(vector unsigned long long __a, vector unsigned long long __b) { - return __a >> __b; + return __a >> (__b % (vector unsigned long long)(sizeof(unsigned long long) * + __CHAR_BIT__)); +} + +static __inline__ vector long long __ATTRS_o_ai +vec_sr(vector long long __a, vector unsigned long long __b) { + return (vector long long)vec_sr((vector unsigned long long)__a, __b); } #endif @@ -9544,12 +9546,12 @@ vec_sr(vector unsigned long long __a, vector unsigned long long __b) { static __inline__ vector signed char __ATTRS_o_ai vec_vsrb(vector signed char __a, vector unsigned char __b) { - return __a >> (vector signed char)__b; + return vec_sr(__a, __b); } static __inline__ vector unsigned char __ATTRS_o_ai vec_vsrb(vector unsigned char __a, vector unsigned char __b) { - return __a >> __b; + return vec_sr(__a, __b); } /* vec_vsrh */ @@ -9558,12 +9560,12 @@ vec_vsrb(vector unsigned char __a, vector unsigned char __b) { static __inline__ vector short __ATTRS_o_ai vec_vsrh(vector short __a, vector unsigned short __b) { - return __a >> (vector short)__b; + return vec_sr(__a, __b); } static __inline__ vector unsigned short __ATTRS_o_ai vec_vsrh(vector unsigned short __a, vector unsigned short __b) { - return __a >> __b; + return vec_sr(__a, __b); } /* vec_vsrw */ @@ -9572,12 +9574,12 @@ vec_vsrh(vector unsigned short __a, vector unsigned short __b) { static __inline__ vector int __ATTRS_o_ai vec_vsrw(vector int __a, vector unsigned int __b) { - return __a >> (vector int)__b; + return vec_sr(__a, __b); } static __inline__ vector unsigned int __ATTRS_o_ai vec_vsrw(vector unsigned int __a, vector unsigned int __b) { - return __a >> __b; + return vec_sr(__a, __b); } /* vec_sra */ @@ -16353,67 +16355,82 @@ vec_revb(vector unsigned __int128 __a) { /* vec_xl */ +typedef vector signed char unaligned_vec_schar __attribute__((aligned(1))); +typedef vector unsigned char unaligned_vec_uchar __attribute__((aligned(1))); +typedef vector signed short unaligned_vec_sshort __attribute__((aligned(1))); +typedef vector unsigned short unaligned_vec_ushort __attribute__((aligned(1))); +typedef vector signed int unaligned_vec_sint __attribute__((aligned(1))); +typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1))); +typedef vector float unaligned_vec_float __attribute__((aligned(1))); + static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset, signed char *__ptr) { - return *(vector signed char *)(__ptr + __offset); + return *(unaligned_vec_schar *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned char vec_xl(signed long long __offset, unsigned char *__ptr) { - return *(vector unsigned char *)(__ptr + __offset); + return *(unaligned_vec_uchar*)(__ptr + __offset); } static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset, signed short *__ptr) { - return *(vector signed short *)(__ptr + __offset); + return *(unaligned_vec_sshort *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned short vec_xl(signed long long __offset, unsigned short *__ptr) { - return *(vector unsigned short *)(__ptr + __offset); + return *(unaligned_vec_ushort *)(__ptr + __offset); } static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset, signed int *__ptr) { - return *(vector signed int *)(__ptr + __offset); + return *(unaligned_vec_sint *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset, unsigned int *__ptr) { - return *(vector unsigned int *)(__ptr + __offset); + return *(unaligned_vec_uint *)(__ptr + __offset); } static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset, float *__ptr) { - return *(vector float *)(__ptr + __offset); + return *(unaligned_vec_float *)(__ptr + __offset); } #ifdef __VSX__ +typedef vector signed long long unaligned_vec_sll __attribute__((aligned(1))); +typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1))); +typedef vector double unaligned_vec_double __attribute__((aligned(1))); + static inline __ATTRS_o_ai vector signed long long vec_xl(signed long long __offset, signed long long *__ptr) { - return *(vector signed long long *)(__ptr + __offset); + return *(unaligned_vec_sll *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned long long vec_xl(signed long long __offset, unsigned long long *__ptr) { - return *(vector unsigned long long *)(__ptr + __offset); + return *(unaligned_vec_ull *)(__ptr + __offset); } static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset, double *__ptr) { - return *(vector double *)(__ptr + __offset); + return *(unaligned_vec_double *)(__ptr + __offset); } #endif #if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1))); +typedef vector unsigned __int128 unaligned_vec_ui128 + __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed __int128 vec_xl(signed long long __offset, signed __int128 *__ptr) { - return *(vector signed __int128 *)(__ptr + __offset); + return *(unaligned_vec_si128 *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned __int128 vec_xl(signed long long __offset, unsigned __int128 *__ptr) { - return *(vector unsigned __int128 *)(__ptr + __offset); + return *(unaligned_vec_ui128 *)(__ptr + __offset); } #endif @@ -16498,62 +16515,62 @@ vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { static inline __ATTRS_o_ai void vec_xst(vector signed char __vec, signed long long __offset, signed char *__ptr) { - *(vector signed char *)(__ptr + __offset) = __vec; + *(unaligned_vec_schar *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec, signed long long __offset, unsigned char *__ptr) { - *(vector unsigned char *)(__ptr + __offset) = __vec; + *(unaligned_vec_uchar *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector signed short __vec, signed long long __offset, signed short *__ptr) { - *(vector signed short *)(__ptr + __offset) = __vec; + *(unaligned_vec_sshort *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec, signed long long __offset, unsigned short *__ptr) { - *(vector unsigned short *)(__ptr + __offset) = __vec; + *(unaligned_vec_ushort *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector signed int __vec, signed long long __offset, signed int *__ptr) { - *(vector signed int *)(__ptr + __offset) = __vec; + *(unaligned_vec_sint *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec, signed long long __offset, unsigned int *__ptr) { - *(vector unsigned int *)(__ptr + __offset) = __vec; + *(unaligned_vec_uint *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector float __vec, signed long long __offset, float *__ptr) { - *(vector float *)(__ptr + __offset) = __vec; + *(unaligned_vec_float *)(__ptr + __offset) = __vec; } #ifdef __VSX__ static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec, signed long long __offset, signed long long *__ptr) { - *(vector signed long long *)(__ptr + __offset) = __vec; + *(unaligned_vec_sll *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec, signed long long __offset, unsigned long long *__ptr) { - *(vector unsigned long long *)(__ptr + __offset) = __vec; + *(unaligned_vec_ull *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector double __vec, signed long long __offset, double *__ptr) { - *(vector double *)(__ptr + __offset) = __vec; + *(unaligned_vec_double *)(__ptr + __offset) = __vec; } #endif @@ -16561,13 +16578,13 @@ static inline __ATTRS_o_ai void vec_xst(vector double __vec, static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec, signed long long __offset, signed __int128 *__ptr) { - *(vector signed __int128 *)(__ptr + __offset) = __vec; + *(unaligned_vec_si128 *)(__ptr + __offset) = __vec; } static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, signed long long __offset, unsigned __int128 *__ptr) { - *(vector unsigned __int128 *)(__ptr + __offset) = __vec; + *(unaligned_vec_ui128 *)(__ptr + __offset) = __vec; } #endif diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h index fc4632374977..a90a255376c0 100644 --- a/lib/Headers/avx512bwintrin.h +++ b/lib/Headers/avx512bwintrin.h @@ -32,7 +32,216 @@ typedef unsigned int __mmask32; typedef unsigned long long __mmask64; /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"))) + +static __inline __mmask32 __DEFAULT_FN_ATTRS +_knot_mask32(__mmask32 __M) +{ + return __builtin_ia32_knotsi(__M); +} + +static __inline __mmask64 __DEFAULT_FN_ATTRS +_knot_mask64(__mmask64 __M) +{ + return __builtin_ia32_knotdi(__M); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kand_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kand_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kandn_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kandn_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kxnor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kxnor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kxor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kxor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kadd_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kadd_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); +} + +#define _kshiftli_mask32(A, I) \ + (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)) + +#define _kshiftri_mask32(A, I) \ + (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)) + +#define _kshiftli_mask64(A, I) \ + (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)) + +#define _kshiftri_mask64(A, I) \ + (__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask32_u32(__mmask32 __A) { + return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_cvtmask64_u64(__mmask64 __A) { + return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_cvtu32_mask32(unsigned int __A) { + return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_cvtu64_mask64(unsigned long long __A) { + return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_load_mask32(__mmask32 *__A) { + return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_load_mask64(__mmask64 *__A) { + return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask32(__mmask32 *__A, __mmask32 __B) { + *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask64(__mmask64 *__A, __mmask64 __B) { + *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B); +} /* Integer compare */ @@ -176,102 +385,102 @@ typedef unsigned long long __mmask64; #define _mm512_mask_cmpneq_epu16_mask(k, A, B) \ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_add_epi8 (__m512i __A, __m512i __B) { return (__m512i) ((__v64qu) __A + (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_add_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_add_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sub_epi8 (__m512i __A, __m512i __B) { return (__m512i) ((__v64qu) __A - (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_sub_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_sub_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_add_epi16 (__m512i __A, __m512i __B) { return (__m512i) ((__v32hu) __A + (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_add_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_add_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sub_epi16 (__m512i __A, __m512i __B) { return (__m512i) ((__v32hu) __A - (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sub_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sub_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mullo_epi16 (__m512i __A, __m512i __B) { return (__m512i) ((__v32hu) __A * (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mullo_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mullo_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, @@ -279,7 +488,7 @@ _mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, @@ -287,13 +496,13 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) (__v32hi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_abs_epi8 (__m512i __A) { return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -301,7 +510,7 @@ _mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -309,13 +518,13 @@ _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_abs_epi16 (__m512i __A) { return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -323,7 +532,7 @@ _mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -331,13 +540,13 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_packs_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -345,7 +554,7 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -353,13 +562,13 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -367,7 +576,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -375,13 +584,13 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_packus_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -389,7 +598,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -397,13 +606,13 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -411,7 +620,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -419,119 +628,95 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_adds_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_adds_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_adds_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_adds_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu8 (__m512i __A, __m512i __B) { typedef unsigned short __v64hu __attribute__((__vector_size__(128))); @@ -541,7 +726,7 @@ _mm512_avg_epu8 (__m512i __A, __m512i __B) >> 1, __v64qu); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { @@ -550,7 +735,7 @@ _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -558,7 +743,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu16 (__m512i __A, __m512i __B) { typedef unsigned int __v32su __attribute__((__vector_size__(128))); @@ -568,7 +753,7 @@ _mm512_avg_epu16 (__m512i __A, __m512i __B) >> 1, __v32hu); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { @@ -577,7 +762,7 @@ _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -585,13 +770,13 @@ _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epi8 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -599,7 +784,7 @@ _mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -607,13 +792,13 @@ _mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epi16 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -621,7 +806,7 @@ _mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { @@ -630,13 +815,13 @@ _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epu8 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -644,7 +829,7 @@ _mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -652,13 +837,13 @@ _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epu16 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -666,7 +851,7 @@ _mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -674,13 +859,13 @@ _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epi8 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -688,7 +873,7 @@ _mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -696,13 +881,13 @@ _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epi16 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -710,7 +895,7 @@ _mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -718,13 +903,13 @@ _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epu8 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -732,7 +917,7 @@ _mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -740,13 +925,13 @@ _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epu16 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -754,7 +939,7 @@ _mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -762,13 +947,13 @@ _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_shuffle_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -776,7 +961,7 @@ _mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -784,126 +969,102 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_subs_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_subs_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_subs_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_subs_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_si512(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, __m512i __B) { @@ -912,7 +1073,7 @@ _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) { @@ -921,7 +1082,7 @@ _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, (__v32hi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, __m512i __B) { @@ -930,13 +1091,13 @@ _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mulhrs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -944,7 +1105,7 @@ _mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -952,13 +1113,13 @@ _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mulhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { @@ -967,7 +1128,7 @@ _mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -975,13 +1136,13 @@ _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mulhi_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -989,7 +1150,7 @@ _mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -997,12 +1158,12 @@ _mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, @@ -1010,114 +1171,114 @@ _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, (__v32hi)_mm512_maddubs_epi16(__X, __Y), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_madd_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_madd_epi16(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_madd_epi16(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi16_epi8 (__m512i __A) { return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, (__v32qi)_mm256_setzero_si256(), (__mmask32) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, (__v32qi)__O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, (__v32qi) _mm256_setzero_si256(), __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi16_epi8 (__m512i __A) { return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, (__v32qi) _mm256_setzero_si256(), (__mmask32) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, (__v32qi) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, (__v32qi) _mm256_setzero_si256(), __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtepi16_epi8 (__m512i __A) { return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, (__v32qi) _mm256_undefined_si256(), (__mmask32) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, (__v32qi) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, (__v32qi) _mm256_setzero_si256(), __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, 8, 64+8, 9, 64+9, @@ -1138,21 +1299,21 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { 62, 64+62, 63, 64+63); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, 4, 32+4, 5, 32+5, @@ -1165,21 +1326,21 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { 30, 32+30, 31, 32+31); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, 0, 64+0, 1, 64+1, @@ -1200,21 +1361,21 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { 54, 64+54, 55, 64+55); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, 0, 32+0, 1, 32+1, @@ -1227,21 +1388,21 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { 26, 32+26, 27, 32+27); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi8_epi16(__m256i __A) { /* This function always performs a signed extension, but __v32qi is a char @@ -1249,7 +1410,7 @@ _mm512_cvtepi8_epi16(__m256i __A) return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1257,7 +1418,7 @@ _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1265,13 +1426,13 @@ _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu8_epi16(__m256i __A) { return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1279,7 +1440,7 @@ _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1320,13 +1481,13 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) (imm)), \ (__v32hi)_mm512_setzero_si512()) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1334,7 +1495,7 @@ _mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1342,13 +1503,13 @@ _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sll_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1356,7 +1517,7 @@ _mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1364,13 +1525,13 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi16(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1378,7 +1539,7 @@ _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1389,13 +1550,13 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B) #define _mm512_bslli_epi128(a, imm) \ (__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srlv_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1403,7 +1564,7 @@ _mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1411,13 +1572,13 @@ _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srav_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1425,7 +1586,7 @@ _mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1433,13 +1594,13 @@ _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sra_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1447,7 +1608,7 @@ _mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1455,13 +1616,13 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srai_epi16(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1469,7 +1630,7 @@ _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1477,13 +1638,13 @@ _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srl_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1491,7 +1652,7 @@ _mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1499,13 +1660,13 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srli_epi16(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1513,7 +1674,7 @@ _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1524,7 +1685,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) #define _mm512_bsrli_epi128(a, imm) \ (__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, @@ -1532,7 +1693,7 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) (__v32hi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, @@ -1540,7 +1701,7 @@ _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) (__v32hi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, @@ -1548,7 +1709,7 @@ _mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) (__v64qi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, @@ -1556,7 +1717,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) (__v64qi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, @@ -1564,7 +1725,7 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, @@ -1572,21 +1733,30 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) (__v64qi) _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, (__mmask64) __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, (__mmask32) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi16*)__P)->__v; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P, @@ -1594,7 +1764,7 @@ _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) (__mmask32) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P, @@ -1603,7 +1773,16 @@ _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) (__mmask32) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi8 (void const *__P) +{ + struct __loadu_epi8 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi8*)__P)->__v; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P, @@ -1611,7 +1790,7 @@ _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) (__mmask64) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P, @@ -1619,7 +1798,17 @@ _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) _mm512_setzero_si512 (), (__mmask64) __U); } -static __inline__ void __DEFAULT_FN_ATTRS + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + struct __storeu_epi16 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) { __builtin_ia32_storedquhi512_mask ((__v32hi *) __P, @@ -1627,7 +1816,16 @@ _mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) (__mmask32) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + struct __storeu_epi8 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) { __builtin_ia32_storedquqi512_mask ((__v64qi *) __P, @@ -1635,86 +1833,86 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) (__mmask64) __U); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_test_epi8_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_test_epi16_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_testn_epi8_mask (__m512i __A, __m512i __B) { return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_testn_epi16_mask (__m512i __A, __m512i __B) { return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 _mm512_movepi8_mask (__m512i __A) { return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 _mm512_movepi16_mask (__m512i __A) { return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_movm_epi8 (__mmask64 __A) { return (__m512i) __builtin_ia32_cvtmask2b512 (__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_movm_epi16 (__mmask32 __A) { return (__m512i) __builtin_ia32_cvtmask2w512 (__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcastb_epi8 (__m128i __A) { return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A, @@ -1724,7 +1922,7 @@ _mm512_broadcastb_epi8 (__m128i __A) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, @@ -1732,7 +1930,7 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, @@ -1740,7 +1938,7 @@ _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) (__v64qi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, @@ -1748,7 +1946,7 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, @@ -1756,7 +1954,7 @@ _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcastw_epi16 (__m128i __A) { return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A, @@ -1764,7 +1962,7 @@ _mm512_broadcastw_epi16 (__m128i __A) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, @@ -1772,7 +1970,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, @@ -1780,13 +1978,13 @@ _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_permutexvar_epi16 (__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { @@ -1795,7 +1993,7 @@ _mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { @@ -1832,15 +2030,14 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ (__v32hi)_mm512_setzero_si512()) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sad_epu8 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, (__v64qi) __B); } - - +#undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS #endif diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h index 8a00b3afa9d5..6e6c293af22e 100644 --- a/lib/Headers/avx512dqintrin.h +++ b/lib/Headers/avx512dqintrin.h @@ -29,180 +29,309 @@ #define __AVX512DQINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"))) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline __mmask8 __DEFAULT_FN_ATTRS +_knot_mask8(__mmask8 __M) +{ + return __builtin_ia32_knotqi(__M); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kand_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kandn_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kxnor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kxor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_ktestchi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kadd_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_kadd_mask16(__mmask16 __A, __mmask16 __B) +{ + return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); +} + +#define _kshiftli_mask8(A, I) \ + (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)) + +#define _kshiftri_mask8(A, I) \ + (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask8_u32(__mmask8 __A) { + return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_cvtu32_mask8(unsigned int __A) { + return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_load_mask8(__mmask8 *__A) { + return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask8(__mmask8 *__A, __mmask8 __B) { + *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mullo_epi64 (__m512i __A, __m512i __B) { return (__m512i) ((__v8du) __A * (__v8du) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_mullo_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_mullo_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_xor_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A ^ (__v8du)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_xor_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_xor_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_xor_ps (__m512 __A, __m512 __B) { return (__m512)((__v16su)__A ^ (__v16su)__B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_xor_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_xor_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_or_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A | (__v8du)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_or_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_or_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_or_ps(__m512 __A, __m512 __B) { return (__m512)((__v16su)__A | (__v16su)__B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_or_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_or_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_and_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A & (__v8du)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_and_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_and_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_and_ps(__m512 __A, __m512 __B) { return (__m512)((__v16su)__A & (__v16su)__B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_and_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_and_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_andnot_pd(__m512d __A, __m512d __B) { return (__m512d)(~(__v8du)__A & (__v8du)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_andnot_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_andnot_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_andnot_ps(__m512 __A, __m512 __B) { return (__m512)(~(__v16su)__A & (__v16su)__B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_andnot_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_andnot_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epi64 (__m512d __A) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -210,7 +339,7 @@ _mm512_cvtpd_epi64 (__m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) __W, @@ -218,7 +347,7 @@ _mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -241,7 +370,7 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu64 (__m512d __A) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -249,7 +378,7 @@ _mm512_cvtpd_epu64 (__m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) __W, @@ -257,7 +386,7 @@ _mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -280,7 +409,7 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi64 (__m256 __A) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -288,7 +417,7 @@ _mm512_cvtps_epi64 (__m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) __W, @@ -296,7 +425,7 @@ _mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -319,7 +448,7 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu64 (__m256 __A) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -327,7 +456,7 @@ _mm512_cvtps_epu64 (__m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) __W, @@ -335,7 +464,7 @@ _mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -359,19 +488,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_pd (__m512i __A) { return (__m512d)__builtin_convertvector((__v8di)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtepi64_pd(__A), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtepi64_pd(__A), @@ -393,7 +522,7 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { (__v8df)_mm512_setzero_pd(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_ps (__m512i __A) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps(), @@ -401,7 +530,7 @@ _mm512_cvtepi64_ps (__m512i __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) __W, @@ -409,7 +538,7 @@ _mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps(), @@ -433,7 +562,7 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epi64 (__m512d __A) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -441,7 +570,7 @@ _mm512_cvttpd_epi64 (__m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) __W, @@ -449,7 +578,7 @@ _mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -472,7 +601,7 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epu64 (__m512d __A) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -480,7 +609,7 @@ _mm512_cvttpd_epu64 (__m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) __W, @@ -488,7 +617,7 @@ _mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512(), @@ -511,7 +640,7 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi64 (__m256 __A) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -519,7 +648,7 @@ _mm512_cvttps_epi64 (__m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) __W, @@ -527,7 +656,7 @@ _mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -550,7 +679,7 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epu64 (__m256 __A) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -558,7 +687,7 @@ _mm512_cvttps_epu64 (__m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) __W, @@ -566,7 +695,7 @@ _mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512(), @@ -589,19 +718,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepu64_pd (__m512i __A) { return (__m512d)__builtin_convertvector((__v8du)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtepu64_pd(__A), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtepu64_pd(__A), @@ -625,7 +754,7 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { (__mmask8)(U), (int)(R)) -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtepu64_ps (__m512i __A) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps(), @@ -633,7 +762,7 @@ _mm512_cvtepu64_ps (__m512i __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) __W, @@ -641,7 +770,7 @@ _mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps(), @@ -935,32 +1064,32 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(C), (int)(R)) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_movepi32_mask (__m512i __A) { return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_movm_epi32 (__mmask16 __A) { return (__m512i) __builtin_ia32_cvtmask2d512 (__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_movm_epi64 (__mmask8 __A) { return (__m512i) __builtin_ia32_cvtmask2q512 (__A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 _mm512_movepi64_mask (__m512i __A) { return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcast_f32x2 (__m128 __A) { return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, @@ -968,7 +1097,7 @@ _mm512_broadcast_f32x2 (__m128 __A) 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -976,7 +1105,7 @@ _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -984,7 +1113,7 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcast_f32x8(__m256 __A) { return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, @@ -992,7 +1121,7 @@ _mm512_broadcast_f32x8(__m256 __A) 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1000,7 +1129,7 @@ _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1008,14 +1137,14 @@ _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_broadcast_f64x2(__m128d __A) { return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -1023,7 +1152,7 @@ _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) (__v8df)__O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -1031,7 +1160,7 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcast_i32x2 (__m128i __A) { return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, @@ -1039,7 +1168,7 @@ _mm512_broadcast_i32x2 (__m128i __A) 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1047,7 +1176,7 @@ _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1055,7 +1184,7 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcast_i32x8(__m256i __A) { return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, @@ -1063,7 +1192,7 @@ _mm512_broadcast_i32x8(__m256i __A) 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1071,7 +1200,7 @@ _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1079,14 +1208,14 @@ _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcast_i64x2(__m128i __A) { return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1094,7 +1223,7 @@ _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) (__v8di)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1256,6 +1385,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ (__mmask8)(U)) +#undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS #endif diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h index 8dd4a0a40eea..1c19993ff1bb 100644 --- a/lib/Headers/avx512fintrin.h +++ b/lib/Headers/avx512fintrin.h @@ -175,6 +175,7 @@ typedef enum /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) /* Create vectors with repeated elements */ @@ -508,13 +509,13 @@ _mm512_castsi512_si256 (__m512i __A) return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_int2mask(int __a) { return (__mmask16)__a; } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask2int(__mmask16 __a) { return (int)__a; @@ -4329,6 +4330,15 @@ _mm512_loadu_si512 (void const *__P) } static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi32*)__P)->__v; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, @@ -4347,6 +4357,15 @@ _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) } static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi64*)__P)->__v; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, @@ -4482,6 +4501,15 @@ _mm512_load_epi64 (void const *__P) /* SIMD store ops */ static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi64 (void *__P, __m512i __A) +{ + struct __storeu_epi64 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, @@ -4498,6 +4526,15 @@ _mm512_storeu_si512 (void *__P, __m512i __A) } static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi32 (void *__P, __m512i __A) +{ + struct __storeu_epi32 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi32*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, @@ -4580,7 +4617,7 @@ _mm512_store_epi64 (void *__P, __m512i __A) /* Mask ops */ -static __inline __mmask16 __DEFAULT_FN_ATTRS512 +static __inline __mmask16 __DEFAULT_FN_ATTRS _mm512_knot(__mmask16 __M) { return __builtin_ia32_knothi(__M); @@ -5622,7 +5659,7 @@ _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R)) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kmov (__mmask16 __A) { return __A; @@ -7593,177 +7630,177 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) #define _mm512_i64gather_ps(index, addr, scale) \ (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ (int)(scale)) #define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ - (float const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i64gather_epi32(index, addr, scale) \ (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), \ (__mmask8)-1, (int)(scale)) #define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i64gather_pd(index, addr, scale) \ (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ (int)(scale)) #define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i64gather_epi64(index, addr, scale) \ (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ (int)(scale)) #define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v8di)(__m512i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i32gather_ps(index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v16sf)(__m512)(index), \ (__mmask16)-1, (int)(scale)) #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v16sf)(__m512)(index), \ (__mmask16)(mask), (int)(scale)) #define _mm512_i32gather_epi32(index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v16si)(__m512i)(index), \ (__mmask16)-1, (int)(scale)) #define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v16si)(__m512i)(index), \ (__mmask16)(mask), (int)(scale)) #define _mm512_i32gather_pd(index, addr, scale) \ (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), (__mmask8)-1, \ (int)(scale)) #define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i32gather_epi64(index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), (__mmask8)-1, \ (int)(scale)) #define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm512_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ (__v8sf)(__m256)(v1), (int)(scale)) #define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ (__v8sf)(__m256)(v1), (int)(scale)) #define _mm512_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ (__v8si)(__m256i)(v1), (int)(scale)) #define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ (__v8si)(__m256i)(v1), (int)(scale)) #define _mm512_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ (__v8df)(__m512d)(v1), (int)(scale)) #define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ (__v8df)(__m512d)(v1), (int)(scale)) #define _mm512_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ (__v8di)(__m512i)(v1), (int)(scale)) #define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ (__v8di)(__m512i)(v1), (int)(scale)) #define _mm512_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \ + __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \ (__v16si)(__m512i)(index), \ (__v16sf)(__m512)(v1), (int)(scale)) #define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \ + __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \ (__v16si)(__m512i)(index), \ (__v16sf)(__m512)(v1), (int)(scale)) #define _mm512_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \ + __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \ (__v16si)(__m512i)(index), \ (__v16si)(__m512i)(v1), (int)(scale)) #define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \ + __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \ (__v16si)(__m512i)(index), \ (__v16si)(__m512i)(v1), (int)(scale)) #define _mm512_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), \ (__v8df)(__m512d)(v1), (int)(scale)) #define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), \ (__v8df)(__m512d)(v1), (int)(scale)) #define _mm512_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), \ (__v8di)(__m512i)(v1), (int)(scale)) #define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), \ (__v8di)(__m512i)(v1), (int)(scale)) @@ -8320,54 +8357,105 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kand (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kandn (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS _mm512_kortestc (__mmask16 __A, __mmask16 __B) { return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS _mm512_kortestz (__mmask16 __A, __mmask16 __B) { return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_kortestchi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kxnor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kxor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); } +#define _kand_mask16 _mm512_kand +#define _kandn_mask16 _mm512_kandn +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor +#define _kxnor_mask16 _mm512_kxnor +#define _kxor_mask16 _mm512_kxor + +#define _kshiftli_mask16(A, I) \ + (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)) + +#define _kshiftri_mask16(A, I) \ + (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask16_u32(__mmask16 __A) { + return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_cvtu32_mask16(unsigned int __A) { + return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_load_mask16(__mmask16 *__A) { + return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask16(__mmask16 *__A, __mmask16 __B) { + *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); +} + static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_stream_si512 (__m512i * __P, __m512i __A) { @@ -9594,5 +9682,6 @@ _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS #endif /* __AVX512FINTRIN_H */ diff --git a/lib/Headers/avx512pfintrin.h b/lib/Headers/avx512pfintrin.h index 5b8260b77c63..73b2234fb410 100644 --- a/lib/Headers/avx512pfintrin.h +++ b/lib/Headers/avx512pfintrin.h @@ -33,78 +33,78 @@ #define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ - (long long const *)(addr), (int)(scale), \ + (void const *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \ __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ - (long long const *)(addr), (int)(scale), \ + (void const *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfdps((__mmask16)(mask), \ - (__v16si)(__m512i)(index), (int const *)(addr), \ + (__v16si)(__m512i)(index), (void const *)(addr), \ (int)(scale), (int)(hint)) #define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \ __builtin_ia32_gatherpfdps((__mmask16) -1, \ - (__v16si)(__m512i)(index), (int const *)(addr), \ + (__v16si)(__m512i)(index), (void const *)(addr), \ (int)(scale), (int)(hint)) #define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (long long const *)(addr), (int)(scale), \ + (void const *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \ __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ - (long long const *)(addr), (int)(scale), \ + (void const *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (int const *)(addr), (int)(scale), (int)(hint)) + (void const *)(addr), (int)(scale), (int)(hint)) #define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \ __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ - (int const *)(addr), (int)(scale), (int)(hint)) + (void const *)(addr), (int)(scale), (int)(hint)) #define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \ __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ - (long long *)(addr), (int)(scale), \ + (void *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ - (long long *)(addr), (int)(scale), \ + (void *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \ __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)) + (void *)(addr), (int)(scale), (int)(hint)) #define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfdps((__mmask16)(mask), \ - (__v16si)(__m512i)(index), (int *)(addr), \ + (__v16si)(__m512i)(index), (void *)(addr), \ (int)(scale), (int)(hint)) #define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \ __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ - (long long *)(addr), (int)(scale), \ + (void *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (long long *)(addr), (int)(scale), \ + (void *)(addr), (int)(scale), \ (int)(hint)) #define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \ __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)) + (void *)(addr), (int)(scale), (int)(hint)) #define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)) + (void *)(addr), (int)(scale), (int)(hint)) #undef __DEFAULT_FN_ATTRS diff --git a/lib/Headers/avx512vbmi2intrin.h b/lib/Headers/avx512vbmi2intrin.h index d2a58094fd07..53242524293f 100644 --- a/lib/Headers/avx512vbmi2intrin.h +++ b/lib/Headers/avx512vbmi2intrin.h @@ -227,167 +227,141 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) (__v32hi)_mm512_setzero_si512()) static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - __U); + return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B, + (__v8di)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shldv_epi64(__A, __B, __C), + (__v8di)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shldv_epi64(__A, __B, __C), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - __U); + return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shldv_epi32(__A, __B, __C), + (__v16si)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shldv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); } - static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - __U); + return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - __U); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), + (__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - __U); + return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B, + (__v8di)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), + (__v8di)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, - (__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - __U); + return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - __U); + return (__m512i) __builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), + (__v16si)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i) __builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); } - static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - __U); + return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - __U); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), + (__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B) +_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512()); } diff --git a/lib/Headers/avx512vbmiintrin.h b/lib/Headers/avx512vbmiintrin.h index b6e93c285871..5463d9015504 100644 --- a/lib/Headers/avx512vbmiintrin.h +++ b/lib/Headers/avx512vbmiintrin.h @@ -91,30 +91,26 @@ _mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) +_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, + __m512i __Y) { - return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v64qi) _mm512_setzero_si512 (), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y) +_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v64qi) _mm512_undefined_epi32 (), - (__mmask64) -1); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), + (__v64qi)_mm512_setzero_si512()); } diff --git a/lib/Headers/avx512vbmivlintrin.h b/lib/Headers/avx512vbmivlintrin.h index 9a0400b2b5d5..b5d5aa9af523 100644 --- a/lib/Headers/avx512vbmivlintrin.h +++ b/lib/Headers/avx512vbmivlintrin.h @@ -150,61 +150,49 @@ _mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) +_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, - (__v16qi) __Y, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y) +_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, + __m128i __Y) { - return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, - (__v16qi) __Y, - (__v16qi) - _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y) +_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, - (__v16qi) __Y, - (__v16qi) - _mm_undefined_si128 (), - (__mmask16) -1); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) +_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, - (__v32qi) __Y, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y) +_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, + __m256i __Y) { - return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, - (__v32qi) __Y, - (__v32qi) - _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y) +_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, - (__v32qi) __Y, - (__v32qi) - _mm256_undefined_si256 (), - (__mmask32) -1); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), + (__v32qi)_mm256_setzero_si256()); } diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h index 1b038dd04df6..87e0023e8b74 100644 --- a/lib/Headers/avx512vlbwintrin.h +++ b/lib/Headers/avx512vlbwintrin.h @@ -2297,6 +2297,15 @@ _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) (__v32qi) _mm256_setzero_si256()); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi16*)__P)->__v; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) { @@ -2314,6 +2323,15 @@ _mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi16*)__P)->__v; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) { @@ -2331,6 +2349,15 @@ _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) (__mmask16) __U); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi8 (void const *__P) +{ + struct __loadu_epi8 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi8*)__P)->__v; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) { @@ -2348,6 +2375,15 @@ _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) (__mmask16) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi8 (void const *__P) +{ + struct __loadu_epi8 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi8*)__P)->__v; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) { @@ -2364,7 +2400,17 @@ _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) _mm256_setzero_si256 (), (__mmask32) __U); } -static __inline__ void __DEFAULT_FN_ATTRS256 + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + struct __storeu_epi16 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedquhi128_mask ((__v8hi *) __P, @@ -2372,6 +2418,15 @@ _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + struct __storeu_epi16 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) { @@ -2380,6 +2435,15 @@ _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) (__mmask16) __U); } +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + struct __storeu_epi8 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) { @@ -2388,6 +2452,15 @@ _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) (__mmask16) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + struct __storeu_epi8 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) { diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h index 0ee1d00ef4d2..a2cdc0a96e59 100644 --- a/lib/Headers/avx512vlintrin.h +++ b/lib/Headers/avx512vlintrin.h @@ -462,10 +462,16 @@ _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_and_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a & (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_and_si256(__A, __B), + (__v8si)_mm256_and_epi32(__A, __B), (__v8si)__W); } @@ -476,10 +482,16 @@ _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_and_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a & (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_and_si128(__A, __B), + (__v4si)_mm_and_epi32(__A, __B), (__v4si)__W); } @@ -490,10 +502,16 @@ _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_andnot_epi32(__m256i __A, __m256i __B) +{ + return (__m256i)(~(__v8su)__A & (__v8su)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_andnot_si256(__A, __B), + (__v8si)_mm256_andnot_epi32(__A, __B), (__v8si)__W); } @@ -505,24 +523,36 @@ _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_andnot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i)(~(__v4su)__A & (__v4su)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_andnot_si128(__A, __B), + (__v4si)_mm_andnot_epi32(__A, __B), (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_or_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a | (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_or_si256(__A, __B), + (__v8si)_mm256_or_epi32(__A, __B), (__v8si)__W); } @@ -533,10 +563,16 @@ _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_or_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a | (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_or_si128(__A, __B), + (__v4si)_mm_or_epi32(__A, __B), (__v4si)__W); } @@ -547,10 +583,16 @@ _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_xor_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a ^ (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_xor_si256(__A, __B), + (__v8si)_mm256_xor_epi32(__A, __B), (__v8si)__W); } @@ -561,11 +603,16 @@ _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_xor_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a ^ (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_xor_si128(__A, __B), + (__v4si)_mm_xor_epi32(__A, __B), (__v4si)__W); } @@ -576,10 +623,16 @@ _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_and_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a & (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_and_si256(__A, __B), + (__v4di)_mm256_and_epi64(__A, __B), (__v4di)__W); } @@ -590,10 +643,16 @@ _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_and_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a & (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_and_si128(__A, __B), + (__v2di)_mm_and_epi64(__A, __B), (__v2di)__W); } @@ -604,10 +663,16 @@ _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_andnot_epi64(__m256i __A, __m256i __B) +{ + return (__m256i)(~(__v4du)__A & (__v4du)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_andnot_si256(__A, __B), + (__v4di)_mm256_andnot_epi64(__A, __B), (__v4di)__W); } @@ -619,10 +684,16 @@ _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_andnot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)(~(__v2du)__A & (__v2du)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_andnot_si128(__A, __B), + (__v2di)_mm_andnot_epi64(__A, __B), (__v2di)__W); } @@ -633,10 +704,16 @@ _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_or_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a | (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_or_si256(__A, __B), + (__v4di)_mm256_or_epi64(__A, __B), (__v4di)__W); } @@ -647,10 +724,16 @@ _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_or_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a | (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_or_si128(__A, __B), + (__v2di)_mm_or_epi64(__A, __B), (__v2di)__W); } @@ -661,10 +744,16 @@ _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) } static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_xor_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a ^ (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_xor_si256(__A, __B), + (__v4di)_mm256_xor_epi64(__A, __B), (__v4di)__W); } @@ -675,11 +764,17 @@ _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_xor_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a ^ (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_xor_si128(__A, __B), + (__v2di)_mm_xor_epi64(__A, __B), (__v2di)__W); } @@ -3389,162 +3484,162 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { } #define _mm_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ (__v2df)(__m128d)(v1), (int)(scale)) #define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ (__v2df)(__m128d)(v1), (int)(scale)) #define _mm_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ (__v2di)(__m128i)(v1), (int)(scale)) #define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ (__v2di)(__m128i)(v1), (int)(scale)) #define _mm256_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ (__v4df)(__m256d)(v1), (int)(scale)) #define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ (__v4df)(__m256d)(v1), (int)(scale)) #define _mm256_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ (__v4di)(__m256i)(v1), (int)(scale)) #define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ (__v4di)(__m256i)(v1), (int)(scale)) #define _mm_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm256_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm256_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \ + __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ (__v2df)(__m128d)(v1), (int)(scale)) #define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), \ (__v2df)(__m128d)(v1), (int)(scale)) #define _mm_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ (__v2di)(__m128i)(v1), (int)(scale)) #define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), \ (__v2di)(__m128i)(v1), (int)(scale)) #define _mm256_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ (__v4df)(__m256d)(v1), (int)(scale)) #define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), \ (__v4df)(__m256d)(v1), (int)(scale)) #define _mm256_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ (__v4di)(__m256i)(v1), (int)(scale)) #define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), \ (__v4di)(__m256i)(v1), (int)(scale)) #define _mm_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ (int)(scale)) #define _mm_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \ (__v4si)(__m128i)(index), \ (__v4si)(__m128i)(v1), (int)(scale)) #define _mm256_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ (int)(scale)) #define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ (int)(scale)) #define _mm256_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \ + __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), \ (__v8si)(__m256i)(v1), (int)(scale)) #define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \ + __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), \ (__v8si)(__m256i)(v1), (int)(scale)) @@ -4989,6 +5084,12 @@ _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) (__v8si) _mm256_setzero_si256 ()); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_load_epi32 (void const *__P) +{ + return *(__m128i *) __P; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) { @@ -5008,6 +5109,12 @@ _mm_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_load_epi32 (void const *__P) +{ + return *(__m256i *) __P; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) { @@ -5027,6 +5134,12 @@ _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } +static __inline void __DEFAULT_FN_ATTRS128 +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) { @@ -5035,6 +5148,12 @@ _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) { @@ -5075,6 +5194,12 @@ _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) (__v4di) _mm256_setzero_si256 ()); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_load_epi64 (void const *__P) +{ + return *(__m128i *) __P; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) { @@ -5094,6 +5219,12 @@ _mm_maskz_load_epi64 (__mmask8 __U, void const *__P) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_load_epi64 (void const *__P) +{ + return *(__m256i *) __P; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) { @@ -5113,6 +5244,12 @@ _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) __U); } +static __inline void __DEFAULT_FN_ATTRS128 +_mm_store_epi64 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) { @@ -5121,6 +5258,12 @@ _mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_store_epi64 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) { @@ -5366,6 +5509,15 @@ _mm256_maskz_load_ps (__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi64*)__P)->__v; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { @@ -5383,6 +5535,15 @@ _mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi64*)__P)->__v; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) { @@ -5400,6 +5561,15 @@ _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi32*)__P)->__v; +} + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { @@ -5417,6 +5587,15 @@ _mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_epi32*)__P)->__v; +} + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) { @@ -5534,6 +5713,15 @@ _mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi64 (void *__P, __m128i __A) +{ + struct __storeu_epi64 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) { @@ -5542,6 +5730,15 @@ _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi64 (void *__P, __m256i __A) +{ + struct __storeu_epi64 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) { @@ -5550,6 +5747,15 @@ _mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi32 (void *__P, __m128i __A) +{ + struct __storeu_epi32 { + __m128i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi32*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) { @@ -5558,6 +5764,15 @@ _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi32 (void *__P, __m256i __A) +{ + struct __storeu_epi32 { + __m256i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi32*)__P)->__v = __A; +} + static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) { @@ -7769,97 +7984,97 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) #define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v2di)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v2di)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v4di)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v4di)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v2di)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v2di)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v4di)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v4di)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ - (double const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ - (long long const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v4si)(__m128i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ - (float const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) #define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ - (int const *)(addr), \ + (void const *)(addr), \ (__v8si)(__m256i)(index), \ (__mmask8)(mask), (int)(scale)) diff --git a/lib/Headers/avx512vlvbmi2intrin.h b/lib/Headers/avx512vlvbmi2intrin.h index baaf5654631c..632d14fb55aa 100644 --- a/lib/Headers/avx512vlvbmi2intrin.h +++ b/lib/Headers/avx512vlvbmi2intrin.h @@ -421,327 +421,279 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) (__v8hi)_mm_setzero_si128()) static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - __U); + return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, + (__v4di)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shldv_epi64(__A, __B, __C), + (__v4di)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shldv_epi64(__A, __B, __C), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - __U); + return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, + (__v2di)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - __U); + return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, + (__v8si)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shldv_epi32(__A, __B, __C), + (__v8si)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shldv_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - __U); + return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, + (__v4si)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - __U); + return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, + (__v16hi)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - __U); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), + (__v16hi)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - __U); + return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - __U); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shldv_epi16(__A, __B, __C), + (__v8hi)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shldv_epi16(__A, __B, __C), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - __U); + return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, + (__v4di)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), + (__v4di)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, - (__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - __U); + return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, + (__v2di)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, - (__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - __U); + return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, + (__v8si)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), + (__v8si)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - __U); + return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, + (__v4si)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - __U); + return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, + (__v16hi)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - __U); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), + (__v16hi)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) +_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, - (__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - __U); + return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - __U); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) +_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, - (__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)_mm_setzero_si128()); } diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h index d03bef442a28..56c20b78d340 100644 --- a/lib/Headers/bmiintrin.h +++ b/lib/Headers/bmiintrin.h @@ -62,7 +62,7 @@ static __inline__ unsigned short __RELAXED_FN_ATTRS __tzcnt_u16(unsigned short __X) { - return __X ? __builtin_ctzs(__X) : 16; + return __builtin_ia32_tzcnt_u16(__X); } /// Performs a bitwise AND of the second operand with the one's @@ -196,7 +196,7 @@ __blsr_u32(unsigned int __X) static __inline__ unsigned int __RELAXED_FN_ATTRS __tzcnt_u32(unsigned int __X) { - return __X ? __builtin_ctz(__X) : 32; + return __builtin_ia32_tzcnt_u32(__X); } /// Counts the number of trailing zero bits in the operand. @@ -212,7 +212,7 @@ __tzcnt_u32(unsigned int __X) static __inline__ int __RELAXED_FN_ATTRS _mm_tzcnt_32(unsigned int __X) { - return __X ? __builtin_ctz(__X) : 32; + return __builtin_ia32_tzcnt_u32(__X); } #ifdef __x86_64__ @@ -359,7 +359,7 @@ __blsr_u64(unsigned long long __X) static __inline__ unsigned long long __RELAXED_FN_ATTRS __tzcnt_u64(unsigned long long __X) { - return __X ? __builtin_ctzll(__X) : 64; + return __builtin_ia32_tzcnt_u64(__X); } /// Counts the number of trailing zero bits in the operand. @@ -375,7 +375,7 @@ __tzcnt_u64(unsigned long long __X) static __inline__ long long __RELAXED_FN_ATTRS _mm_tzcnt_64(unsigned long long __X) { - return __X ? __builtin_ctzll(__X) : 64; + return __builtin_ia32_tzcnt_u64(__X); } #endif /* __x86_64__ */ diff --git a/lib/Headers/cuda_wrappers/new b/lib/Headers/cuda_wrappers/new index 71b7a52363ce..f49811c5a57c 100644 --- a/lib/Headers/cuda_wrappers/new +++ b/lib/Headers/cuda_wrappers/new @@ -73,10 +73,12 @@ __device__ inline void operator delete[](void *ptr, // Sized delete, C++14 only. #if __cplusplus >= 201402L -__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { +__device__ inline void operator delete(void *ptr, + __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } -__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { +__device__ inline void operator delete[](void *ptr, + __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } #endif diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index f0ea7cd05c63..6d61f9719944 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -1675,7 +1675,49 @@ _mm_loadu_si64(void const *__a) long long __v; } __attribute__((__packed__, __may_alias__)); long long __u = ((struct __loadu_si64*)__a)->__v; - return __extension__ (__m128i)(__v2di){__u, 0L}; + return __extension__ (__m128i)(__v2di){__u, 0LL}; +} + +/// Loads a 32-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. +/// +/// \param __a +/// A pointer to a 32-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [4 x i32] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si32(void const *__a) +{ + struct __loadu_si32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + int __u = ((struct __loadu_si32*)__a)->__v; + return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; +} + +/// Loads a 16-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __a +/// A pointer to a 16-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [8 x i16] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si16(void const *__a) +{ + struct __loadu_si16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + short __u = ((struct __loadu_si16*)__a)->__v; + return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; } /// Loads a 64-bit double-precision value to the low element of a @@ -3993,6 +4035,69 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) ((struct __storeu_si128*)__p)->__v = __b; } +/// Stores a 64-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. +/// +/// \param __p +/// A pointer to a 64-bit memory location. The address of the memory +/// location does not have to be algned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si64(void const *__p, __m128i __b) +{ + struct __storeu_si64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; +} + +/// Stores a 32-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. +/// +/// \param __p +/// A pointer to a 32-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si32(void const *__p, __m128i __b) +{ + struct __storeu_si32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; +} + +/// Stores a 16-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __p +/// A pointer to a 16-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si16(void const *__p, __m128i __b) +{ + struct __storeu_si16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; +} + /// Moves bytes selected by the mask from the first operand to the /// specified unaligned memory location. When a mask bit is 1, the /// corresponding byte is written, otherwise it is not written. diff --git a/lib/Headers/float.h b/lib/Headers/float.h index 44d4d05494f5..56215cd624d7 100644 --- a/lib/Headers/float.h +++ b/lib/Headers/float.h @@ -21,8 +21,8 @@ *===-----------------------------------------------------------------------=== */ -#ifndef __FLOAT_H -#define __FLOAT_H +#ifndef __CLANG_FLOAT_H +#define __CLANG_FLOAT_H /* If we're on MinGW, fall back to the system's float.h, which might have * additional definitions provided for Windows. @@ -85,6 +85,9 @@ # undef FLT_DECIMAL_DIG # undef DBL_DECIMAL_DIG # undef LDBL_DECIMAL_DIG +# undef FLT_HAS_SUBNORM +# undef DBL_HAS_SUBNORM +# undef LDBL_HAS_SUBNORM # endif #endif @@ -141,6 +144,9 @@ # define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ # define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__ # define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ +# define FLT_HAS_SUBNORM __FLT_HAS_DENORM__ +# define DBL_HAS_SUBNORM __DBL_HAS_DENORM__ +# define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__ #endif #ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ @@ -157,4 +163,4 @@ # define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ #endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ -#endif /* __FLOAT_H */ +#endif /* __CLANG_FLOAT_H */ diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h index e7bfbf964d56..7d0722ec7652 100644 --- a/lib/Headers/immintrin.h +++ b/lib/Headers/immintrin.h @@ -306,6 +306,65 @@ _writegsbase_u64(unsigned long long __V) #endif #endif /* __FSGSBASE__ */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MOVBE__) + +/* The structs used below are to force the load/store to be unaligned. This + * is accomplished with the __packed__ attribute. The __may_alias__ prevents + * tbaa metadata from being generated based on the struct and the type of the + * field inside of it. + */ + +static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i16(void const * __P) { + struct __loadu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap16(((struct __loadu_i16*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i16(void * __P, short __D) { + struct __storeu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i32(void const * __P) { + struct __loadu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap32(((struct __loadu_i32*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i32(void * __P, int __D) { + struct __storeu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D); +} + +#ifdef __x86_64__ +static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i64(void const * __P) { + struct __loadu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap64(((struct __loadu_i64*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i64(void * __P, long long __D) { + struct __storeu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D); +} +#endif +#endif /* __MOVBE */ + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__) #include <rtmintrin.h> #include <xtestintrin.h> diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h index 91914214e299..c86f41faeb88 100644 --- a/lib/Headers/intrin.h +++ b/lib/Headers/intrin.h @@ -90,8 +90,6 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long); void __lidt(void *); unsigned __int64 __ll_lshift(unsigned __int64, int); __int64 __ll_rshift(__int64, int); -unsigned int __lzcnt(unsigned int); -unsigned short __lzcnt16(unsigned short); static __inline__ void __movsb(unsigned char *, unsigned char const *, size_t); static __inline__ @@ -219,7 +217,6 @@ void __incgsbyte(unsigned long); void __incgsdword(unsigned long); void __incgsqword(unsigned long); void __incgsword(unsigned long); -unsigned __int64 __lzcnt64(unsigned __int64); static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); static __inline__ @@ -329,189 +326,63 @@ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); |* Interlocked Exchange Add \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); -} +char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value); +char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value); +char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value); +short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value); +short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value); +short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value); +long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value); +long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value); +long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value); +__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value); +__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value); +__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value); #endif /*----------------------------------------------------------------------------*\ |* Interlocked Increment \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedIncrement16_acq(short volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedIncrement16_nf(short volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedIncrement16_rel(short volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedIncrement_acq(long volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedIncrement_nf(long volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedIncrement_rel(long volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedIncrement64_acq(__int64 volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedIncrement64_nf(__int64 volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedIncrement64_rel(__int64 volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); -} +short _InterlockedIncrement16_acq(short volatile *_Value); +short _InterlockedIncrement16_nf(short volatile *_Value); +short _InterlockedIncrement16_rel(short volatile *_Value); +long _InterlockedIncrement_acq(long volatile *_Value); +long _InterlockedIncrement_nf(long volatile *_Value); +long _InterlockedIncrement_rel(long volatile *_Value); +__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value); +__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value); +__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value); #endif /*----------------------------------------------------------------------------*\ |* Interlocked Decrement \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedDecrement16_acq(short volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedDecrement16_nf(short volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedDecrement16_rel(short volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedDecrement_acq(long volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedDecrement_nf(long volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedDecrement_rel(long volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedDecrement64_acq(__int64 volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedDecrement64_nf(__int64 volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedDecrement64_rel(__int64 volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); -} +short _InterlockedDecrement16_acq(short volatile *_Value); +short _InterlockedDecrement16_nf(short volatile *_Value); +short _InterlockedDecrement16_rel(short volatile *_Value); +long _InterlockedDecrement_acq(long volatile *_Value); +long _InterlockedDecrement_nf(long volatile *_Value); +long _InterlockedDecrement_rel(long volatile *_Value); +__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value); +__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value); +__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value); #endif /*----------------------------------------------------------------------------*\ |* Interlocked And \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedAnd8_acq(char volatile *_Value, char _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedAnd8_nf(char volatile *_Value, char _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedAnd8_rel(char volatile *_Value, char _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedAnd16_acq(short volatile *_Value, short _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedAnd16_nf(short volatile *_Value, short _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedAnd16_rel(short volatile *_Value, short _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedAnd_acq(long volatile *_Value, long _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedAnd_nf(long volatile *_Value, long _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedAnd_rel(long volatile *_Value, long _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); -} +char _InterlockedAnd8_acq(char volatile *_Value, char _Mask); +char _InterlockedAnd8_nf(char volatile *_Value, char _Mask); +char _InterlockedAnd8_rel(char volatile *_Value, char _Mask); +short _InterlockedAnd16_acq(short volatile *_Value, short _Mask); +short _InterlockedAnd16_nf(short volatile *_Value, short _Mask); +short _InterlockedAnd16_rel(short volatile *_Value, short _Mask); +long _InterlockedAnd_acq(long volatile *_Value, long _Mask); +long _InterlockedAnd_nf(long volatile *_Value, long _Mask); +long _InterlockedAnd_rel(long volatile *_Value, long _Mask); +__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask); #endif /*----------------------------------------------------------------------------*\ |* Bit Counting and Testing @@ -534,261 +405,81 @@ unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase, |* Interlocked Or \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedOr8_acq(char volatile *_Value, char _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedOr8_nf(char volatile *_Value, char _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedOr8_rel(char volatile *_Value, char _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedOr16_acq(short volatile *_Value, short _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedOr16_nf(short volatile *_Value, short _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedOr16_rel(short volatile *_Value, short _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedOr_acq(long volatile *_Value, long _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedOr_nf(long volatile *_Value, long _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedOr_rel(long volatile *_Value, long _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); -} +char _InterlockedOr8_acq(char volatile *_Value, char _Mask); +char _InterlockedOr8_nf(char volatile *_Value, char _Mask); +char _InterlockedOr8_rel(char volatile *_Value, char _Mask); +short _InterlockedOr16_acq(short volatile *_Value, short _Mask); +short _InterlockedOr16_nf(short volatile *_Value, short _Mask); +short _InterlockedOr16_rel(short volatile *_Value, short _Mask); +long _InterlockedOr_acq(long volatile *_Value, long _Mask); +long _InterlockedOr_nf(long volatile *_Value, long _Mask); +long _InterlockedOr_rel(long volatile *_Value, long _Mask); +__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask); #endif /*----------------------------------------------------------------------------*\ |* Interlocked Xor \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedXor8_acq(char volatile *_Value, char _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedXor8_nf(char volatile *_Value, char _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedXor8_rel(char volatile *_Value, char _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedXor16_acq(short volatile *_Value, short _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedXor16_nf(short volatile *_Value, short _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedXor16_rel(short volatile *_Value, short _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedXor_acq(long volatile *_Value, long _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedXor_nf(long volatile *_Value, long _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedXor_rel(long volatile *_Value, long _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); -} +char _InterlockedXor8_acq(char volatile *_Value, char _Mask); +char _InterlockedXor8_nf(char volatile *_Value, char _Mask); +char _InterlockedXor8_rel(char volatile *_Value, char _Mask); +short _InterlockedXor16_acq(short volatile *_Value, short _Mask); +short _InterlockedXor16_nf(short volatile *_Value, short _Mask); +short _InterlockedXor16_rel(short volatile *_Value, short _Mask); +long _InterlockedXor_acq(long volatile *_Value, long _Mask); +long _InterlockedXor_nf(long volatile *_Value, long _Mask); +long _InterlockedXor_rel(long volatile *_Value, long _Mask); +__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask); +__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask); #endif /*----------------------------------------------------------------------------*\ |* Interlocked Exchange \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchange8_acq(char volatile *_Target, char _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); - return _Value; -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchange8_nf(char volatile *_Target, char _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); - return _Value; -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchange8_rel(char volatile *_Target, char _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); - return _Value; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchange16_acq(short volatile *_Target, short _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); - return _Value; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchange16_nf(short volatile *_Target, short _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); - return _Value; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchange16_rel(short volatile *_Target, short _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); - return _Value; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchange_acq(long volatile *_Target, long _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); - return _Value; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchange_nf(long volatile *_Target, long _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); - return _Value; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchange_rel(long volatile *_Target, long _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); - return _Value; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); - return _Value; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); - return _Value; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); - return _Value; -} +char _InterlockedExchange8_acq(char volatile *_Target, char _Value); +char _InterlockedExchange8_nf(char volatile *_Target, char _Value); +char _InterlockedExchange8_rel(char volatile *_Target, char _Value); +short _InterlockedExchange16_acq(short volatile *_Target, short _Value); +short _InterlockedExchange16_nf(short volatile *_Target, short _Value); +short _InterlockedExchange16_rel(short volatile *_Target, short _Value); +long _InterlockedExchange_acq(long volatile *_Target, long _Value); +long _InterlockedExchange_nf(long volatile *_Target, long _Value); +long _InterlockedExchange_rel(long volatile *_Target, long _Value); +__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value); +__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value); +__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value); #endif /*----------------------------------------------------------------------------*\ |* Interlocked Compare Exchange \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedCompareExchange8_acq(char volatile *_Destination, - char _Exchange, char _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - return _Comparand; -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedCompareExchange8_nf(char volatile *_Destination, - char _Exchange, char _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return _Comparand; -} -static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedCompareExchange8_rel(char volatile *_Destination, - char _Exchange, char _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); - return _Comparand; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedCompareExchange16_acq(short volatile *_Destination, - short _Exchange, short _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - return _Comparand; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedCompareExchange16_nf(short volatile *_Destination, - short _Exchange, short _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return _Comparand; -} -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedCompareExchange16_rel(short volatile *_Destination, - short _Exchange, short _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); - return _Comparand; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedCompareExchange_acq(long volatile *_Destination, - long _Exchange, long _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - return _Comparand; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedCompareExchange_nf(long volatile *_Destination, - long _Exchange, long _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return _Comparand; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedCompareExchange_rel(long volatile *_Destination, - long _Exchange, long _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); - return _Comparand; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64_acq(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - return _Comparand; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64_nf(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return _Comparand; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64_rel(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { - __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); - return _Comparand; -} +char _InterlockedCompareExchange8_acq(char volatile *_Destination, + char _Exchange, char _Comparand); +char _InterlockedCompareExchange8_nf(char volatile *_Destination, + char _Exchange, char _Comparand); +char _InterlockedCompareExchange8_rel(char volatile *_Destination, + char _Exchange, char _Comparand); +short _InterlockedCompareExchange16_acq(short volatile *_Destination, + short _Exchange, short _Comparand); +short _InterlockedCompareExchange16_nf(short volatile *_Destination, + short _Exchange, short _Comparand); +short _InterlockedCompareExchange16_rel(short volatile *_Destination, + short _Exchange, short _Comparand); +long _InterlockedCompareExchange_acq(long volatile *_Destination, + long _Exchange, long _Comparand); +long _InterlockedCompareExchange_nf(long volatile *_Destination, + long _Exchange, long _Comparand); +long _InterlockedCompareExchange_rel(long volatile *_Destination, + long _Exchange, long _Comparand); +__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand); +__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand); +__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand); #endif /*----------------------------------------------------------------------------*\ @@ -841,7 +532,7 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) { __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level)); + : "a"(__level), "c"(0)); } static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level, int __ecx) { @@ -858,23 +549,32 @@ static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { __asm__ volatile ("hlt"); } +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) static __inline__ void __DEFAULT_FN_ATTRS __nop(void) { __asm__ volatile ("nop"); } #endif -#if defined(__x86_64__) -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__shiftleft128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) { - unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l; - unsigned __int128 __res = __val << (__d & 63); - return (unsigned __int64)(__res >> 64); + +/*----------------------------------------------------------------------------*\ +|* MS AArch64 specific +\*----------------------------------------------------------------------------*/ +#if defined(__aarch64__) +unsigned __int64 __getReg(int); +long _InterlockedAdd(long volatile *Addend, long Value); +int _ReadStatusReg(int); +void _WriteStatusReg(int, int); + +static inline unsigned short _byteswap_ushort (unsigned short val) { + return __builtin_bswap16(val); } -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__shiftright128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) { - unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l; - unsigned __int128 __res = __val >> (__d & 63); - return (unsigned __int64)__res; +static inline unsigned long _byteswap_ulong (unsigned long val) { + return __builtin_bswap32(val); +} +static inline unsigned __int64 _byteswap_uint64 (unsigned __int64 val) { + return __builtin_bswap64(val); } #endif diff --git a/lib/Headers/lzcntintrin.h b/lib/Headers/lzcntintrin.h index 558f1828f0e7..35c1651cc4a8 100644 --- a/lib/Headers/lzcntintrin.h +++ b/lib/Headers/lzcntintrin.h @@ -31,6 +31,7 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#ifndef _MSC_VER /// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> @@ -41,11 +42,8 @@ /// An unsigned 16-bit integer whose leading zeros are to be counted. /// \returns An unsigned 16-bit integer containing the number of leading zero /// bits in the operand. -static __inline__ unsigned short __DEFAULT_FN_ATTRS -__lzcnt16(unsigned short __X) -{ - return __X ? __builtin_clzs(__X) : 16; -} +#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X)) +#endif // _MSC_VER /// Counts the number of leading zero bits in the operand. /// @@ -61,7 +59,7 @@ __lzcnt16(unsigned short __X) static __inline__ unsigned int __DEFAULT_FN_ATTRS __lzcnt32(unsigned int __X) { - return __X ? __builtin_clz(__X) : 32; + return __builtin_ia32_lzcnt_u32(__X); } /// Counts the number of leading zero bits in the operand. @@ -78,10 +76,11 @@ __lzcnt32(unsigned int __X) static __inline__ unsigned int __DEFAULT_FN_ATTRS _lzcnt_u32(unsigned int __X) { - return __X ? __builtin_clz(__X) : 32; + return __builtin_ia32_lzcnt_u32(__X); } #ifdef __x86_64__ +#ifndef _MSC_VER /// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> @@ -93,11 +92,8 @@ _lzcnt_u32(unsigned int __X) /// \returns An unsigned 64-bit integer containing the number of leading zero /// bits in the operand. /// \see _lzcnt_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__lzcnt64(unsigned long long __X) -{ - return __X ? __builtin_clzll(__X) : 64; -} +#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X)) +#endif // _MSC_VER /// Counts the number of leading zero bits in the operand. /// @@ -113,7 +109,7 @@ __lzcnt64(unsigned long long __X) static __inline__ unsigned long long __DEFAULT_FN_ATTRS _lzcnt_u64(unsigned long long __X) { - return __X ? __builtin_clzll(__X) : 64; + return __builtin_ia32_lzcnt_u64(__X); } #endif diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h index e481c792df71..160bae807174 100644 --- a/lib/Headers/opencl-c.h +++ b/lib/Headers/opencl-c.h @@ -22,6 +22,14 @@ #endif //cl_khr_3d_image_writes #endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0 +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#ifndef cl_intel_planar_yuv +#define cl_intel_planar_yuv +#endif // cl_intel_planar_yuv +#pragma OPENCL EXTENSION cl_intel_planar_yuv : begin +#pragma OPENCL EXTENSION cl_intel_planar_yuv : end +#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 + #define __ovld __attribute__((overloadable)) #define __conv __attribute__((convergent)) @@ -14462,7 +14470,7 @@ half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask); #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf -int printf(__constant const char* st, ...); +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); #endif // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions @@ -14602,6 +14610,7 @@ int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord); uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord); +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord); float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); @@ -14609,6 +14618,7 @@ int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_ int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord); uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); +#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord); float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord); @@ -14618,6 +14628,7 @@ int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord); uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord); +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, int2 coord); float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); @@ -14625,6 +14636,7 @@ int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_ int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord); uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); +#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 #ifdef cl_khr_depth_images float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord); @@ -14727,6 +14739,8 @@ uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 + /** * Sampler-less Image Access */ @@ -14760,24 +14774,31 @@ float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord); int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord); uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord); +#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 + // Image read functions returning half4 type #ifdef cl_khr_fp16 half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, int coord); half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, float coord); -half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord); -half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord); half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, int2 coord); half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord); half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord); half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord); +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord); +half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord); half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord); half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, float4 coord); +/** + * Sampler-less Image Access + */ half4 __purefn __ovld read_imageh(read_only image1d_t image, int coord); half4 __purefn __ovld read_imageh(read_only image2d_t image, int2 coord); half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord); half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord); half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord); half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord); +#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 #endif //cl_khr_fp16 // Image read functions for read_write images @@ -15707,7 +15728,6 @@ double __ovld __conv work_group_scan_inclusive_max(double x); // OpenCL v2.0 s6.13.16 - Pipe Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define PIPE_RESERVE_ID_VALID_BIT (1U << 30) #define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -16193,6 +16213,637 @@ void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, u void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data ); #endif // cl_intel_subgroups_short +#ifdef cl_intel_device_side_avc_motion_estimation +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin + +#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 + +#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) + +#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 + +// MCE built-in functions +uchar __ovld +intel_sub_group_avc_mce_get_default_inter_base_multi_reference_penalty( + uchar slice_type, uchar qp); +ulong __ovld intel_sub_group_avc_mce_get_default_inter_shape_penalty( + uchar slice_type, uchar qp); +uchar __ovld intel_sub_group_avc_mce_get_default_inter_direction_penalty( + uchar slice_type, uchar qp); +uint __ovld intel_sub_group_avc_mce_get_default_intra_luma_shape_penalty( + uchar slice_type, uchar qp); +uint2 __ovld +intel_sub_group_avc_mce_get_default_inter_motion_vector_cost_table( + uchar slice_type, uchar qp); +uchar __ovld intel_sub_group_avc_mce_get_default_intra_luma_mode_penalty( + uchar slice_type, uchar qp); + +uint2 __ovld intel_sub_group_avc_mce_get_default_high_penalty_cost_table(); +uint2 __ovld intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(); +uint2 __ovld intel_sub_group_avc_mce_get_default_low_penalty_cost_table(); +uint __ovld intel_sub_group_avc_mce_get_default_non_dc_luma_intra_penalty(); +uchar __ovld +intel_sub_group_avc_mce_get_default_intra_chroma_mode_base_penalty(); + +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_inter_base_multi_reference_penalty( + uchar reference_base_penalty, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_inter_shape_penalty( + ulong packed_shape_penalty, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_inter_direction_penalty( + uchar direction_cost, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_motion_vector_cost_function( + ulong packed_cost_center_delta, uint2 packed_cost_table, + uchar cost_precision, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_ac_only_haar( + intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_source_interlaced_field_polarity( + uchar src_field_polarity, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_single_reference_interlaced_field_polarity( + uchar ref_field_polarity, intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_mce_set_dual_reference_interlaced_field_polarities( + uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity, + intel_sub_group_avc_mce_payload_t payload); + +ulong __ovld intel_sub_group_avc_mce_get_motion_vectors( + intel_sub_group_avc_mce_result_t result); +ushort __ovld intel_sub_group_avc_mce_get_inter_distortions( + intel_sub_group_avc_mce_result_t result); +ushort __ovld intel_sub_group_avc_mce_get_best_inter_distortion( + intel_sub_group_avc_mce_result_t result); +uchar __ovld intel_sub_group_avc_mce_get_inter_major_shape( + intel_sub_group_avc_mce_result_t result); +uchar __ovld intel_sub_group_avc_mce_get_inter_minor_shapes( + intel_sub_group_avc_mce_result_t result); +uchar __ovld intel_sub_group_avc_mce_get_inter_directions( + intel_sub_group_avc_mce_result_t result); +uchar __ovld intel_sub_group_avc_mce_get_inter_motion_vector_count( + intel_sub_group_avc_mce_result_t result); +uint __ovld intel_sub_group_avc_mce_get_inter_reference_ids( + intel_sub_group_avc_mce_result_t result); +uchar __ovld +intel_sub_group_avc_mce_get_inter_reference_interlaced_field_polarities( + uint packed_reference_ids, uint packed_reference_parameter_field_polarities, + intel_sub_group_avc_mce_result_t result); + +// IME built-in functions +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_initialize( + ushort2 src_coord, uchar partition_mask, uchar sad_adjustment); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_single_reference( + short2 ref_offset, uchar search_window_config, + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_dual_reference( + short2 fwd_ref_offset, short2 bwd_ref_offset, uchar search_window_config, + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_max_motion_vector_count( + uchar max_motion_vector_count, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_unidirectional_mix_disable( + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_early_search_termination_threshold( + uchar threshold, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_weighted_sad( + uint packed_sad_weights, intel_sub_group_avc_ime_payload_t payload); + +__attribute__((deprecated("If you use the latest Intel driver, please use " + "intel_sub_group_avc_ime_ref_window_size instead", + "intel_sub_group_avc_ime_ref_window_size"))) +ushort2 __ovld +intel_sub_group_ime_ref_window_size(uchar search_window_config, char dual_ref); +ushort2 __ovld intel_sub_group_avc_ime_ref_window_size( + uchar search_window_config, char dual_ref); +short2 __ovld intel_sub_group_avc_ime_adjust_ref_offset( + short2 ref_offset, ushort2 src_coord, ushort2 ref_window_size, + ushort2 image_size); + +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_evaluate_with_single_reference( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_evaluate_with_dual_reference( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_result_single_reference_streamout_t __ovld +intel_sub_group_avc_ime_evaluate_with_single_reference_streamout( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_result_dual_reference_streamout_t __ovld +intel_sub_group_avc_ime_evaluate_with_dual_reference_streamout( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_evaluate_with_single_reference_streamin( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload, + intel_sub_group_avc_ime_single_reference_streamin_t streamin_components); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_evaluate_with_dual_reference_streamin( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload, + intel_sub_group_avc_ime_dual_reference_streamin_t streamin_components); +intel_sub_group_avc_ime_result_single_reference_streamout_t __ovld +intel_sub_group_avc_ime_evaluate_with_single_reference_streaminout( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload, + intel_sub_group_avc_ime_single_reference_streamin_t streamin_components); +intel_sub_group_avc_ime_result_dual_reference_streamout_t __ovld +intel_sub_group_avc_ime_evaluate_with_dual_reference_streaminout( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload, + intel_sub_group_avc_ime_dual_reference_streamin_t streamin_components); + +intel_sub_group_avc_ime_single_reference_streamin_t __ovld +intel_sub_group_avc_ime_get_single_reference_streamin( + intel_sub_group_avc_ime_result_single_reference_streamout_t result); +intel_sub_group_avc_ime_dual_reference_streamin_t __ovld +intel_sub_group_avc_ime_get_dual_reference_streamin( + intel_sub_group_avc_ime_result_dual_reference_streamout_t result); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_strip_single_reference_streamout( + intel_sub_group_avc_ime_result_single_reference_streamout_t result); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_ime_strip_dual_reference_streamout( + intel_sub_group_avc_ime_result_dual_reference_streamout_t result); + +uint __ovld intel_sub_group_avc_ime_get_streamout_major_shape_motion_vectors( + intel_sub_group_avc_ime_result_single_reference_streamout_t result, + uchar major_shape); +ushort __ovld intel_sub_group_avc_ime_get_streamout_major_shape_distortions( + intel_sub_group_avc_ime_result_single_reference_streamout_t result, + uchar major_shape); +uchar __ovld intel_sub_group_avc_ime_get_streamout_major_shape_reference_ids( + intel_sub_group_avc_ime_result_single_reference_streamout_t result, + uchar major_shape); +uint __ovld intel_sub_group_avc_ime_get_streamout_major_shape_motion_vectors( + intel_sub_group_avc_ime_result_dual_reference_streamout_t result, + uchar major_shape, uchar direction); +ushort __ovld intel_sub_group_avc_ime_get_streamout_major_shape_distortions( + intel_sub_group_avc_ime_result_dual_reference_streamout_t result, + uchar major_shape, uchar direction); +uchar __ovld intel_sub_group_avc_ime_get_streamout_major_shape_reference_ids( + intel_sub_group_avc_ime_result_dual_reference_streamout_t result, + uchar major_shape, uchar direction); + +uchar __ovld intel_sub_group_avc_ime_get_border_reached( + uchar image_select, intel_sub_group_avc_ime_result_t result); +uchar __ovld intel_sub_group_avc_ime_get_truncated_search_indication( + intel_sub_group_avc_ime_result_t result); +uchar __ovld +intel_sub_group_avc_ime_get_unidirectional_early_search_termination( + intel_sub_group_avc_ime_result_t result); +uint __ovld intel_sub_group_avc_ime_get_weighting_pattern_minimum_motion_vector( + intel_sub_group_avc_ime_result_t result); +ushort __ovld intel_sub_group_avc_ime_get_weighting_pattern_minimum_distortion( + intel_sub_group_avc_ime_result_t result); + +// REF built-in functions +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_fme_initialize( + ushort2 src_coord, ulong motion_vectors, uchar major_shapes, + uchar minor_shapes, uchar directions, uchar pixel_resolution, + uchar sad_adjustment); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_bme_initialize( + ushort2 src_coord, ulong motion_vectors, uchar major_shapes, + uchar minor_shapes, uchar directions, uchar pixel_resolution, + uchar bidirectional_weight, uchar sad_adjustment); + +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_bidirectional_mix_disable( + intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_bilinear_filter_enable( + intel_sub_group_avc_ref_payload_t payload); + +intel_sub_group_avc_ref_result_t __ovld +intel_sub_group_avc_ref_evaluate_with_single_reference( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_ref_result_t __ovld +intel_sub_group_avc_ref_evaluate_with_dual_reference( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_ref_result_t __ovld +intel_sub_group_avc_ref_evaluate_with_multi_reference( + read_only image2d_t src_image, uint packed_reference_ids, + sampler_t vme_media_sampler, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_ref_result_t __ovld +intel_sub_group_avc_ref_evaluate_with_multi_reference( + read_only image2d_t src_image, uint packed_reference_ids, + uchar packed_reference_field_polarities, sampler_t vme_media_sampler, + intel_sub_group_avc_ref_payload_t payload); + +// SIC built-in functions +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_initialize( + ushort2 src_coord); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_configure_skc( + uint skip_block_partition_type, uint skip_motion_vector_mask, + ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment, + intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_configure_ipe( + uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty, + uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel, + uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels, + uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_configure_ipe( + uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty, + uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel, + uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels, + ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel, + ushort upper_edge_chroma_pixels, uchar intra_sad_adjustment, + intel_sub_group_avc_sic_payload_t payload); +uint __ovld +intel_sub_group_avc_sic_get_motion_vector_mask( + uint skip_block_partition_type, uchar direction); + +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_intra_luma_shape_penalty( + uint packed_shape_cost, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_intra_luma_mode_cost_function( + uchar luma_mode_penalty, uint luma_packed_neighbor_modes, + uint luma_packed_non_dc_penalty, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_intra_chroma_mode_cost_function( + uchar chroma_mode_penalty, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_skc_bilinear_filter_enable( + intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_skc_forward_transform_enable( + ulong packed_sad_coefficients, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_block_based_raw_skip_sad( + uchar block_based_skip_type, + intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_sic_evaluate_ipe( + read_only image2d_t src_image, sampler_t vme_media_sampler, + intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_sic_evaluate_with_single_reference( + read_only image2d_t src_image, read_only image2d_t ref_image, + sampler_t vme_media_sampler, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_sic_evaluate_with_dual_reference( + read_only image2d_t src_image, read_only image2d_t fwd_ref_image, + read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, + intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_sic_evaluate_with_multi_reference( + read_only image2d_t src_image, uint packed_reference_ids, + sampler_t vme_media_sampler, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_sic_evaluate_with_multi_reference( + read_only image2d_t src_image, uint packed_reference_ids, + uchar packed_reference_field_polarities, sampler_t vme_media_sampler, + intel_sub_group_avc_sic_payload_t payload); + +uchar __ovld intel_sub_group_avc_sic_get_ipe_luma_shape( + intel_sub_group_avc_sic_result_t result); +ushort __ovld intel_sub_group_avc_sic_get_best_ipe_luma_distortion( + intel_sub_group_avc_sic_result_t result); +ushort __ovld intel_sub_group_avc_sic_get_best_ipe_chroma_distortion( + intel_sub_group_avc_sic_result_t result); +ulong __ovld intel_sub_group_avc_sic_get_packed_ipe_luma_modes( + intel_sub_group_avc_sic_result_t result); +uchar __ovld intel_sub_group_avc_sic_get_ipe_chroma_mode( + intel_sub_group_avc_sic_result_t result); +uint __ovld intel_sub_group_avc_sic_get_packed_skc_luma_count_threshold( + intel_sub_group_avc_sic_result_t result); +ulong __ovld intel_sub_group_avc_sic_get_packed_skc_luma_sum_threshold( + intel_sub_group_avc_sic_result_t result); +ushort __ovld intel_sub_group_avc_sic_get_inter_raw_sads( + intel_sub_group_avc_sic_result_t result); + +// Wrappers +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_inter_base_multi_reference_penalty( + uchar reference_base_penalty, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_inter_base_multi_reference_penalty( + uchar reference_base_penalty, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_inter_base_multi_reference_penalty( + uchar reference_base_penalty, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_inter_shape_penalty( + ulong packed_shape_cost, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_inter_shape_penalty( + ulong packed_shape_cost, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_inter_shape_penalty( + ulong packed_shape_cost, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_inter_direction_penalty( + uchar direction_cost, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_inter_direction_penalty( + uchar direction_cost, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_inter_direction_penalty( + uchar direction_cost, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_motion_vector_cost_function( + ulong packed_cost_center_delta, uint2 packed_cost_table, + uchar cost_precision, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_motion_vector_cost_function( + ulong packed_cost_center_delta, uint2 packed_cost_table, + uchar cost_precision, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_motion_vector_cost_function( + ulong packed_cost_center_delta, uint2 packed_cost_table, + uchar cost_precision, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_source_interlaced_field_polarity( + uchar src_field_polarity, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_source_interlaced_field_polarity( + uchar src_field_polarity, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_source_interlaced_field_polarity( + uchar src_field_polarity, intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_single_reference_interlaced_field_polarity( + uchar ref_field_polarity, intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_single_reference_interlaced_field_polarity( + uchar ref_field_polarity, intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_single_reference_interlaced_field_polarity( + uchar ref_field_polarity, intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_dual_reference_interlaced_field_polarities( + uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity, + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_dual_reference_interlaced_field_polarities( + uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity, + intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_dual_reference_interlaced_field_polarities( + uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity, + intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_ime_set_ac_only_haar( + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_ref_set_ac_only_haar( + intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_sic_set_ac_only_haar( + intel_sub_group_avc_sic_payload_t payload); + +ulong __ovld intel_sub_group_avc_ime_get_motion_vectors( + intel_sub_group_avc_ime_result_t result); +ulong __ovld intel_sub_group_avc_ref_get_motion_vectors( + intel_sub_group_avc_ref_result_t result); + +ushort __ovld intel_sub_group_avc_ime_get_inter_distortions( + intel_sub_group_avc_ime_result_t result); +ushort __ovld intel_sub_group_avc_ref_get_inter_distortions( + intel_sub_group_avc_ref_result_t result); +ushort __ovld intel_sub_group_avc_sic_get_inter_distortions( + intel_sub_group_avc_sic_result_t result); + +ushort __ovld intel_sub_group_avc_ime_get_best_inter_distortion( + intel_sub_group_avc_ime_result_t result); +ushort __ovld intel_sub_group_avc_ref_get_best_inter_distortion( + intel_sub_group_avc_ref_result_t result); + +uchar __ovld intel_sub_group_avc_ime_get_inter_major_shape( + intel_sub_group_avc_ime_result_t result); +uchar __ovld intel_sub_group_avc_ref_get_inter_major_shape( + intel_sub_group_avc_ref_result_t result); +uchar __ovld intel_sub_group_avc_ime_get_inter_minor_shapes( + intel_sub_group_avc_ime_result_t result); +uchar __ovld intel_sub_group_avc_ref_get_inter_minor_shapes( + intel_sub_group_avc_ref_result_t result); + +uchar __ovld intel_sub_group_avc_ime_get_inter_directions( + intel_sub_group_avc_ime_result_t result); +uchar __ovld intel_sub_group_avc_ref_get_inter_directions( + intel_sub_group_avc_ref_result_t result); + +uchar __ovld intel_sub_group_avc_ime_get_inter_motion_vector_count( + intel_sub_group_avc_ime_result_t result); +uchar __ovld intel_sub_group_avc_ref_get_inter_motion_vector_count( + intel_sub_group_avc_ref_result_t result); + +uint __ovld intel_sub_group_avc_ime_get_inter_reference_ids( + intel_sub_group_avc_ime_result_t result); +uint __ovld intel_sub_group_avc_ref_get_inter_reference_ids( + intel_sub_group_avc_ref_result_t result); + +uchar __ovld +intel_sub_group_avc_ime_get_inter_reference_interlaced_field_polarities( + uint packed_reference_ids, uint packed_reference_parameter_field_polarities, + intel_sub_group_avc_ime_result_t result); +uchar __ovld +intel_sub_group_avc_ref_get_inter_reference_interlaced_field_polarities( + uint packed_reference_ids, uint packed_reference_parameter_field_polarities, + intel_sub_group_avc_ref_result_t result); + +// Type conversion functions +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_ime_convert_to_mce_payload( + intel_sub_group_avc_ime_payload_t payload); +intel_sub_group_avc_ime_payload_t __ovld +intel_sub_group_avc_mce_convert_to_ime_payload( + intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_ref_convert_to_mce_payload( + intel_sub_group_avc_ref_payload_t payload); +intel_sub_group_avc_ref_payload_t __ovld +intel_sub_group_avc_mce_convert_to_ref_payload( + intel_sub_group_avc_mce_payload_t payload); +intel_sub_group_avc_mce_payload_t __ovld +intel_sub_group_avc_sic_convert_to_mce_payload( + intel_sub_group_avc_sic_payload_t payload); +intel_sub_group_avc_sic_payload_t __ovld +intel_sub_group_avc_mce_convert_to_sic_payload( + intel_sub_group_avc_mce_payload_t payload); + +intel_sub_group_avc_mce_result_t __ovld +intel_sub_group_avc_ime_convert_to_mce_result( + intel_sub_group_avc_ime_result_t result); +intel_sub_group_avc_ime_result_t __ovld +intel_sub_group_avc_mce_convert_to_ime_result( + intel_sub_group_avc_mce_result_t result); +intel_sub_group_avc_mce_result_t __ovld +intel_sub_group_avc_ref_convert_to_mce_result( + intel_sub_group_avc_ref_result_t result); +intel_sub_group_avc_ref_result_t __ovld +intel_sub_group_avc_mce_convert_to_ref_result( + intel_sub_group_avc_mce_result_t result); +intel_sub_group_avc_mce_result_t __ovld +intel_sub_group_avc_sic_convert_to_mce_result( + intel_sub_group_avc_sic_result_t result); +intel_sub_group_avc_sic_result_t __ovld +intel_sub_group_avc_mce_convert_to_sic_result( + intel_sub_group_avc_mce_result_t result); +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end +#endif // cl_intel_device_side_avc_motion_estimation + #ifdef cl_amd_media_ops uint __ovld amd_bitalign(uint a, uint b, uint c); uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c); diff --git a/lib/Headers/unwind.h b/lib/Headers/unwind.h index 345fa4d0c193..0e8317e5b9d9 100644 --- a/lib/Headers/unwind.h +++ b/lib/Headers/unwind.h @@ -154,8 +154,12 @@ struct _Unwind_Control_Block { struct _Unwind_Exception { _Unwind_Exception_Class exception_class; _Unwind_Exception_Cleanup_Fn exception_cleanup; +#if !defined (__USING_SJLJ_EXCEPTIONS__) && defined (__SEH__) + _Unwind_Word private_[6]; +#else _Unwind_Word private_1; _Unwind_Word private_2; +#endif /* The Itanium ABI requires that _Unwind_Exception objects are "double-word * aligned". GCC has interpreted this to mean "use the maximum useful * alignment for the target"; so do we. */ diff --git a/lib/Headers/vecintrin.h b/lib/Headers/vecintrin.h index f7061e88949f..e627389838df 100644 --- a/lib/Headers/vecintrin.h +++ b/lib/Headers/vecintrin.h @@ -381,7 +381,7 @@ vec_insert_and_zero(const unsigned long long *__ptr) { static inline __ATTRS_o_ai vector float vec_insert_and_zero(const float *__ptr) { vector float __vec = (vector float)0; - __vec[0] = *__ptr; + __vec[1] = *__ptr; return __vec; } #endif @@ -5942,13 +5942,13 @@ vec_orc(vector unsigned long long __a, vector unsigned long long __b) { static inline __ATTRS_o_ai vector float vec_orc(vector float __a, vector float __b) { - return (vector float)((vector unsigned int)__a & + return (vector float)((vector unsigned int)__a | ~(vector unsigned int)__b); } static inline __ATTRS_o_ai vector double vec_orc(vector double __a, vector double __b) { - return (vector double)((vector unsigned long long)__a & + return (vector double)((vector unsigned long long)__a | ~(vector unsigned long long)__b); } #endif |
