Diffstat (limited to 'contrib/llvm-project/clang/lib/Headers/ppc_wrappers')
13 files changed, 4231 insertions, 3380 deletions
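The headers changed below let existing x86 SSE2/BMI/BMI2 intrinsic code build on PowerPC by mapping the Intel intrinsics onto Altivec/VSX operations, and the emmintrin.h hunk widens the guard from Linux-only to powerpc64 Linux, FreeBSD, and AIX. A minimal usage sketch, not part of the diff, assuming a powerpc64 target and a hypothetical demo.c; the NO_WARN_X86_INTRINSICS define is the one named by the #error in the emmintrin.h hunk, and the compile command shown is only an illustration:

/* demo.c -- hypothetical example of calling the SSE2 wrappers on powerpc64.
   Defining NO_WARN_X86_INTRINSICS before the include (equivalent to passing
   -DNO_WARN_X86_INTRINSICS, as the header comment suggests) silences the
   #error that otherwise warns about using x86 intrinsics on PowerPC.  */
#define NO_WARN_X86_INTRINSICS 1
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(2.0, 1.0);  /* {1.0, 2.0}: low element is the last argument */
  __m128d b = _mm_set1_pd(10.0);     /* {10.0, 10.0} */
  __m128d sum = _mm_add_pd(a, b);    /* element-wise add, lowered to a VSX vector add */
  double out[2];
  _mm_storeu_pd(out, sum);           /* unaligned store of both lanes */
  printf("%f %f\n", out[0], out[1]); /* prints 11.000000 12.000000 */
  return 0;
}

/* Example build line (illustrative only):
     clang -mcpu=power8 -DNO_WARN_X86_INTRINSICS demo.c -o demo          */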
diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmi2intrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmi2intrin.h new file mode 100644 index 000000000000..0dc0d14ad480 --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmi2intrin.h @@ -0,0 +1,134 @@ +/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." +#endif + +#ifndef BMI2INTRIN_H_ +#define BMI2INTRIN_H_ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return ((__X << (32 - __Y)) >> (32 - __Y)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return ((__X << (64 - __Y)) >> (64 - __Y)); +} + +/* __int128 requires base 64-bit. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#ifdef _ARCH_PWR7 +/* popcount and bpermd require power7 minimum. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __M) { + unsigned long __result = 0x0UL; + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c, __t; + unsigned long __p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + __p = 64 - __builtin_popcountl(__M); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (__m != 0) { + __c = __builtin_clzl(__m); + __t = __X << (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t & (__mask >> __c)); + __p++; + } + return __result; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __M) { + unsigned long __p = 0x4040404040404040UL; // initial bit permute control + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c; + unsigned long __result; + + /* if the mask is constant and selects 8 bits or less we can use + the Power8 Bit permute instruction. 
*/ + if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { + /* Also if the pext mask is constant, then the popcount is + constant, we can evaluate the following loop at compile + time and use a constant bit permute vector. */ + long __i; + for (__i = 0; __i < __builtin_popcountl(__M); __i++) { + __c = __builtin_clzl(__m); + __p = (__p << 8) | __c; + __m ^= (__mask >> __c); + } + __result = __builtin_bpermd(__p, __X); + } else { + __p = 64 - __builtin_popcountl(__M); + __result = 0; + /* We could a use a for loop here, but that combined with + -funroll-loops can expand to a lot of code. The while + loop avoids unrolling and the compiler commons the xor + from clearing the mask bit with the (m != 0) test. The + result is a more compact loop setup and body. */ + while (__m != 0) { + unsigned long __t; + __c = __builtin_clzl(__m); + __t = (__X & (__mask >> __c)) >> (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t); + __p++; + } + } + return __result; +} + +/* these 32-bit implementations depend on 64-bit pdep/pext + which depend on _ARCH_PWR7. */ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return _pdep_u64(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return _pext_u64(__X, __Y); +} +#endif /* _ARCH_PWR7 */ +#endif /* __PPC64__ */ + +#endif /* BMI2INTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmiintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmiintrin.h new file mode 100644 index 000000000000..7d3315958c7b --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/bmiintrin.h @@ -0,0 +1,165 @@ +/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead." 
+#endif + +#ifndef BMIINTRIN_H_ +#define BMIINTRIN_H_ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return (~__X & __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) { + return ((__X << (32 - (__L + __P))) >> (32 - __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y >> 8) & 0xFF; + return (_bextr_u32(__X, __P, __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return (__X & -__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +/* use the 64-bit shift, rotate, and count leading zeros instructions + for long long. 
*/ +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return (~__X & __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) { + return ((__X << (64 - (__L + __P))) >> (64 - __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y & 0xFF00) >> 8; + return (_bextr_u64(__X, __P, __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long __X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} +#endif /* __PPC64__ */ + +#endif /* BMIINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/emmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/emmintrin.h index 4dcb8485e2e9..fc18ab9d43b1 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/emmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/emmintrin.h @@ -29,13 +29,15 @@ efficiently as C language float scalar operations or optimized to use vector SIMD operations. We recommend this for new applications. */ -#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#error \ + "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif #ifndef EMMINTRIN_H_ #define EMMINTRIN_H_ -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) #include <altivec.h> @@ -44,6 +46,7 @@ /* SSE2 */ typedef __vector double __v2df; +typedef __vector float __v4f; typedef __vector long long __v2di; typedef __vector unsigned long long __v2du; typedef __vector int __v4si; @@ -55,523 +58,515 @@ typedef __vector unsigned char __v16qu; /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. 
*/ -typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); -typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef double __m128d __attribute__((__vector_size__(16), __may_alias__)); /* Unaligned version of the same types. */ -typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); -typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef double __m128d_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); /* Define two value permute mask. */ -#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) +#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* Create a vector with element 0 as F and the rest zero. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_sd (double __F) -{ - return __extension__ (__m128d){ __F, 0.0 }; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_sd(double __F) { + return __extension__(__m128d){__F, 0.0}; } /* Create a vector with both elements equal to F. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_pd (double __F) -{ - return __extension__ (__m128d){ __F, __F }; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_pd(double __F) { + return __extension__(__m128d){__F, __F}; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_pd1 (double __F) -{ - return _mm_set1_pd (__F); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pd1(double __F) { + return _mm_set1_pd(__F); } /* Create a vector with the lower value X and upper value W. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_pd (double __W, double __X) -{ - return __extension__ (__m128d){ __X, __W }; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pd(double __W, double __X) { + return __extension__(__m128d){__X, __W}; } /* Create a vector with the lower value W and upper value X. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_pd (double __W, double __X) -{ - return __extension__ (__m128d){ __W, __X }; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pd(double __W, double __X) { + return __extension__(__m128d){__W, __X}; } /* Create an undefined vector. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_undefined_pd (void) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_pd(void) { __m128d __Y = __Y; return __Y; } /* Create a vector of zeros. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setzero_pd (void) -{ - return (__m128d) vec_splats (0); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_pd(void) { + return (__m128d)vec_splats(0); } /* Sets the low DPFP value of A from the low value of B. 
*/ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_move_sd (__m128d __A, __m128d __B) -{ - __v2df result = (__v2df) __A; - result [0] = ((__v2df) __B)[0]; - return (__m128d) result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_sd(__m128d __A, __m128d __B) { + __v2df __result = (__v2df)__A; + __result[0] = ((__v2df)__B)[0]; + return (__m128d)__result; } /* Load two DPFP values from P. The address must be 16-byte aligned. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_pd (double const *__P) -{ - return ((__m128d)vec_ld(0, (__v16qu*)__P)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_pd(double const *__P) { + return ((__m128d)vec_ld(0, (__v16qu *)__P)); } /* Load two DPFP values from P. The address need not be 16-byte aligned. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadu_pd (double const *__P) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_pd(double const *__P) { return (vec_vsx_ld(0, __P)); } /* Create a vector with all two elements equal to *P. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load1_pd (double const *__P) -{ - return (vec_splats (*__P)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load1_pd(double const *__P) { + return (vec_splats(*__P)); } /* Create a vector with element 0 as *P and the rest zero. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_sd (double const *__P) -{ - return _mm_set_sd (*__P); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_sd(double const *__P) { + return _mm_set_sd(*__P); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_pd1 (double const *__P) -{ - return _mm_load1_pd (__P); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_pd1(double const *__P) { + return _mm_load1_pd(__P); } /* Load two DPFP values in reverse order. The address must be aligned. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadr_pd (double const *__P) -{ - __v2df __tmp = _mm_load_pd (__P); - return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadr_pd(double const *__P) { + __v2df __tmp = _mm_load_pd(__P); + return (__m128d)vec_xxpermdi(__tmp, __tmp, 2); } /* Store two DPFP values. The address must be 16-byte aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_pd (double *__P, __m128d __A) -{ - vec_st((__v16qu)__A, 0, (__v16qu*)__P); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_pd(double *__P, __m128d __A) { + vec_st((__v16qu)__A, 0, (__v16qu *)__P); } /* Store two DPFP values. The address need not be 16-byte aligned. 
*/ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storeu_pd (double *__P, __m128d __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_pd(double *__P, __m128d __A) { *(__m128d_u *)__P = __A; } /* Stores the lower DPFP value. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_sd (double *__P, __m128d __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_sd(double *__P, __m128d __A) { *__P = ((__v2df)__A)[0]; } -extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_f64 (__m128d __A) -{ +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_f64(__m128d __A) { return ((__v2df)__A)[0]; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storel_pd (double *__P, __m128d __A) -{ - _mm_store_sd (__P, __A); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_pd(double *__P, __m128d __A) { + _mm_store_sd(__P, __A); } /* Stores the upper DPFP value. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storeh_pd (double *__P, __m128d __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeh_pd(double *__P, __m128d __A) { *__P = ((__v2df)__A)[1]; } /* Store the lower DPFP value across two words. The address must be 16-byte aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store1_pd (double *__P, __m128d __A) -{ - _mm_store_pd (__P, vec_splat (__A, 0)); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store1_pd(double *__P, __m128d __A) { + _mm_store_pd(__P, vec_splat(__A, 0)); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_pd1 (double *__P, __m128d __A) -{ - _mm_store1_pd (__P, __A); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_pd1(double *__P, __m128d __A) { + _mm_store1_pd(__P, __A); } /* Store two DPFP values in reverse order. The address must be aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storer_pd (double *__P, __m128d __A) -{ - _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storer_pd(double *__P, __m128d __A) { + _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2)); } /* Intel intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi128_si64 (__m128i __A) -{ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si64(__m128i __A) { return ((__v2di)__A)[0]; } /* Microsoft intrinsic. 
*/ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi128_si64x (__m128i __A) -{ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si64x(__m128i __A) { return ((__v2di)__A)[0]; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_pd (__m128d __A, __m128d __B) -{ - return (__m128d) ((__v2df)__A + (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A + (__v2df)__B); } /* Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_sd (__m128d __A, __m128d __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_sd(__m128d __A, __m128d __B) { __A[0] = __A[0] + __B[0]; return (__A); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_pd (__m128d __A, __m128d __B) -{ - return (__m128d) ((__v2df)__A - (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A - (__v2df)__B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_sd (__m128d __A, __m128d __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_sd(__m128d __A, __m128d __B) { __A[0] = __A[0] - __B[0]; return (__A); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_pd (__m128d __A, __m128d __B) -{ - return (__m128d) ((__v2df)__A * (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A * (__v2df)__B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_sd (__m128d __A, __m128d __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_sd(__m128d __A, __m128d __B) { __A[0] = __A[0] * __B[0]; return (__A); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_pd (__m128d __A, __m128d __B) -{ - return (__m128d) ((__v2df)__A / (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A / (__v2df)__B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_sd (__m128d __A, __m128d __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_sd(__m128d __A, __m128d __B) { __A[0] = __A[0] / __B[0]; return (__A); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_pd (__m128d __A) -{ - return (vec_sqrt (__A)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_pd(__m128d __A) { + return (vec_sqrt(__A)); } /* Return pair {sqrt (B[0]), A[1]}. 
*/ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_sd (__m128d __A, __m128d __B) -{ - __v2df c; - c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); - return (__m128d) _mm_setr_pd (c[0], __A[1]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_sd(__m128d __A, __m128d __B) { + __v2df __c; + __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0])); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_pd (__m128d __A, __m128d __B) -{ - return (vec_min (__A, __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pd(__m128d __A, __m128d __B) { + return (vec_min(__A, __B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_min (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = vec_min(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_pd (__m128d __A, __m128d __B) -{ - return (vec_max (__A, __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pd(__m128d __A, __m128d __B) { + return (vec_max(__A, __B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_max (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = vec_max(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
-_mm_cmpgt_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_pd (__m128d __A, __m128d __B) -{ - __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); - return ((__m128d)vec_nor (temp, temp)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_pd(__m128d __A, __m128d __B) { + __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B); + return ((__m128d)vec_nor(__temp, __temp)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnlt_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnle_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpngt_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnge_pd (__m128d __A, __m128d __B) -{ - return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_pd(__m128d __A, __m128d __B) { + return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpord_pd (__m128d __A, __m128d __B) -{ -#if _ARCH_PWR8 - __v2du c, d; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_pd(__m128d __A, __m128d __B) { + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. 
*/ - c = (__v2du)vec_cmpeq (__A, __A); - d = (__v2du)vec_cmpeq (__B, __B); -#else - __v2du a, b; - __v2du c, d; - const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; - a = (__v2du)vec_abs ((__v2df)__A); - b = (__v2du)vec_abs ((__v2df)__B); - c = (__v2du)vec_cmpgt (double_exp_mask, a); - d = (__v2du)vec_cmpgt (double_exp_mask, b); -#endif + __c = (__v2du)vec_cmpeq(__A, __A); + __d = (__v2du)vec_cmpeq(__B, __B); /* A != NAN and B != NAN. */ - return ((__m128d)vec_and(c, d)); + return ((__m128d)vec_and(__c, __d)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpunord_pd (__m128d __A, __m128d __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_pd(__m128d __A, __m128d __B) { #if _ARCH_PWR8 - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B); /* A == NAN OR B == NAN converts too: NOT(A != NAN) OR NOT(B != NAN). */ - c = vec_nor (c, c); - return ((__m128d)vec_orc(c, d)); + __c = vec_nor(__c, __c); + return ((__m128d)vec_orc(__c, __d)); #else - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B); /* Convert the true ('1's) is NAN. */ - c = vec_nor (c, c); - d = vec_nor (d, d); - return ((__m128d)vec_or(c, d)); + __c = vec_nor(__c, __c); + __d = vec_nor(__d, __d); + return ((__m128d)vec_or(__c, __d)); #endif } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_sd(__m128d __A, __m128d __B) -{ - __v2df a, b, c; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we do the operation. */ - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmpeq(__a, __b); /* Then we merge the lower double result with the original upper double from __A. 
*/ - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpgt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); - c = vec_nor (c, c); - return (__m128d) _mm_setr_pd (c[0], __A[1]); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnlt_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmplt(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmple(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmpgt(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmpge(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); + __c = (__v2df)vec_cmpeq(__a, __b); + __c = vec_nor(__c, __c); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = 
vec_splats(__B[0]); /* Not less than is just greater than or equal. */ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df)vec_cmpge(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnle_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); /* Not less than or equal is just greater than. */ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df)vec_cmpge(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpngt_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); /* Not greater than is just less than or equal. */ - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df)vec_cmple(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnge_sd (__m128d __A, __m128d __B) -{ - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_sd(__m128d __A, __m128d __B) { + __v2df __a, __b, __c; + __a = vec_splats(__A[0]); + __b = vec_splats(__B[0]); /* Not greater than or equal is just less than. */ - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df)vec_cmplt(__a, __b); + return (__m128d)_mm_setr_pd(__c[0], __A[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpord_sd (__m128d __A, __m128d __B) -{ - __v2df r; - r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_sd(__m128d __A, __m128d __B) { + __v2df __r; + __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0])); + return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpunord_sd (__m128d __A, __m128d __B) -{ - __v2df r; - r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], __A[1]); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_sd(__m128d __A, __m128d __B) { + __v2df __r; + __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0])); + return (__m128d)_mm_setr_pd(__r[0], __A[1]); } /* FIXME @@ -581,1744 +576,1694 @@ _mm_cmpunord_sd (__m128d __A, __m128d __B) Technically __mm_comieq_sp et all should be using the ordered compare and signal for QNaNs. The __mm_ucomieq_sd et all should be OK. 
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comieq_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comieq_sd(__m128d __A, __m128d __B) { return (__A[0] == __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comilt_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comilt_sd(__m128d __A, __m128d __B) { return (__A[0] < __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comile_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comile_sd(__m128d __A, __m128d __B) { return (__A[0] <= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comigt_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comigt_sd(__m128d __A, __m128d __B) { return (__A[0] > __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comige_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comige_sd(__m128d __A, __m128d __B) { return (__A[0] >= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comineq_sd (__m128d __A, __m128d __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comineq_sd(__m128d __A, __m128d __B) { return (__A[0] != __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomieq_sd (__m128d __A, __m128d __B) -{ - return (__A[0] == __B[0]); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomieq_sd(__m128d __A, __m128d __B) { + return (__A[0] == __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomilt_sd (__m128d __A, __m128d __B) -{ - return (__A[0] < __B[0]); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomilt_sd(__m128d __A, __m128d __B) { + return (__A[0] < __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomile_sd (__m128d __A, __m128d __B) -{ - return (__A[0] <= __B[0]); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomile_sd(__m128d __A, __m128d __B) { + return (__A[0] <= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomigt_sd (__m128d __A, __m128d __B) -{ - return (__A[0] > __B[0]); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomigt_sd(__m128d __A, __m128d __B) { + return (__A[0] > __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomige_sd (__m128d __A, __m128d __B) -{ - return (__A[0] >= __B[0]); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomige_sd(__m128d __A, __m128d __B) { + return (__A[0] >= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomineq_sd (__m128d __A, __m128d __B) -{ 
+extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomineq_sd(__m128d __A, __m128d __B) { return (__A[0] != __B[0]); } /* Create a vector of Qi, where i is the element number. */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_epi64x (long long __q1, long long __q0) -{ - return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi64x(long long __q1, long long __q0) { + return __extension__(__m128i)(__v2di){__q0, __q1}; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_epi64 (__m64 __q1, __m64 __q0) -{ - return _mm_set_epi64x ((long long)__q1, (long long)__q0); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi64(__m64 __q1, __m64 __q0) { + return _mm_set_epi64x((long long)__q1, (long long)__q0); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) -{ - return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { + return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, - short __q3, short __q2, short __q1, short __q0) -{ - return __extension__ (__m128i)(__v8hi){ - __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3, + short __q2, short __q1, short __q0) { + return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, + __q4, __q5, __q6, __q7}; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, - char __q07, char __q06, char __q05, char __q04, - char __q03, char __q02, char __q01, char __q00) -{ - return __extension__ (__m128i)(__v16qi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 - }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11, + char __q10, char __q09, char __q08, char __q07, char __q06, + char __q05, char __q04, char __q03, char __q02, char __q01, + char __q00) { + return __extension__(__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; } /* Set all of the elements of the vector to A. 
*/ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_epi64x (long long __A) -{ - return _mm_set_epi64x (__A, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi64x(long long __A) { + return _mm_set_epi64x(__A, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_epi64 (__m64 __A) -{ - return _mm_set_epi64 (__A, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi64(__m64 __A) { + return _mm_set_epi64(__A, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_epi32 (int __A) -{ - return _mm_set_epi32 (__A, __A, __A, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi32(int __A) { + return _mm_set_epi32(__A, __A, __A, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_epi16 (short __A) -{ - return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi16(short __A) { + return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_epi8 (char __A) -{ - return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi8(char __A) { + return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A); } /* Create a vector of Qi, where i is the element number. The parameter order is reversed from the _mm_set_epi* functions. 
*/ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_epi64 (__m64 __q0, __m64 __q1) -{ - return _mm_set_epi64 (__q1, __q0); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi64(__m64 __q0, __m64 __q1) { + return _mm_set_epi64(__q1, __q0); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) -{ - return _mm_set_epi32 (__q3, __q2, __q1, __q0); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) { + return _mm_set_epi32(__q3, __q2, __q1, __q0); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, - short __q4, short __q5, short __q6, short __q7) -{ - return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4, + short __q5, short __q6, short __q7) { + return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, - char __q04, char __q05, char __q06, char __q07, - char __q08, char __q09, char __q10, char __q11, - char __q12, char __q13, char __q14, char __q15) -{ - return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, - __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04, + char __q05, char __q06, char __q07, char __q08, char __q09, + char __q10, char __q11, char __q12, char __q13, char __q14, + char __q15) { + return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); } /* Create a vector with element 0 as *P and the rest zero. 
*/ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_si128 (__m128i const *__P) -{ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_si128(__m128i const *__P) { return *__P; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadu_si128 (__m128i_u const *__P) -{ - return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_si128(__m128i_u const *__P) { + return (__m128i)(vec_vsx_ld(0, (signed int const *)__P)); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadl_epi64 (__m128i_u const *__P) -{ - return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_epi64(__m128i_u const *__P) { + return _mm_set_epi64((__m64)0LL, *(__m64 *)__P); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_si128 (__m128i *__P, __m128i __B) -{ - vec_st ((__v16qu) __B, 0, (__v16qu*)__P); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_si128(__m128i *__P, __m128i __B) { + vec_st((__v16qu)__B, 0, (__v16qu *)__P); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storeu_si128 (__m128i_u *__P, __m128i __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_si128(__m128i_u *__P, __m128i __B) { *__P = __B; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storel_epi64 (__m128i_u *__P, __m128i __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_epi64(__m128i_u *__P, __m128i __B) { *(long long *)__P = ((__v2di)__B)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movepi64_pi64 (__m128i_u __B) -{ - return (__m64) ((__v2di)__B)[0]; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi64_pi64(__m128i_u __B) { + return (__m64)((__v2di)__B)[0]; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movpi64_epi64 (__m64 __A) -{ - return _mm_set_epi64 ((__m64)0LL, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movpi64_epi64(__m64 __A) { + return _mm_set_epi64((__m64)0LL, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_move_epi64 (__m128i __A) -{ - return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_epi64(__m128i __A) { + return _mm_set_epi64((__m64)0LL, (__m64)__A[0]); } /* Create an undefined vector. */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_undefined_si128 (void) -{ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_si128(void) { __m128i __Y = __Y; return __Y; } /* Create a vector of zeros. 
*/ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setzero_si128 (void) -{ - return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_si128(void) { + return __extension__(__m128i)(__v4si){0, 0, 0, 0}; } #ifdef _ARCH_PWR8 -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi32_pd (__m128i __A) -{ - __v2di val; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_pd(__m128i __A) { + __v2di __val; /* For LE need to generate Vector Unpack Low Signed Word. Which is generated from unpackh. */ - val = (__v2di)vec_unpackh ((__v4si)__A); + __val = (__v2di)vec_unpackh((__v4si)__A); - return (__m128d)vec_ctf (val, 0); + return (__m128d)vec_ctf(__val, 0); } #endif -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi32_ps (__m128i __A) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_ps(__m128i __A) { return ((__m128)vec_ctf((__v4si)__A, 0)); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpd_epi32 (__m128d __A) -{ - __v2df rounded = vec_rint (__A); - __v4si result, temp; - const __v4si vzero = - { 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_epi32(__m128d __A) { + __v2df __rounded = vec_rint(__A); + __v4si __result, __temp; + const __v4si __vzero = {0, 0, 0, 0}; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. */ - __asm__( - "xvcvdpsxws %x0,%x1" - : "=wa" (temp) - : "wa" (rounded) - : ); + __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :); #ifdef _ARCH_PWR8 - temp = vec_mergeo (temp, temp); - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); +#ifdef __LITTLE_ENDIAN__ + __temp = vec_mergeo(__temp, __temp); +#else + __temp = vec_mergee(__temp, __temp); +#endif + __result = (__v4si)vec_vpkudum((__vector long long)__temp, + (__vector long long)__vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; + __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); } #endif - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpd_pi32 (__m128d __A) -{ - __m128i result = _mm_cvtpd_epi32(__A); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_pi32(__m128d __A) { + __m128i __result = _mm_cvtpd_epi32(__A); - return (__m64) result[0]; + return (__m64)__result[0]; } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpd_ps (__m128d __A) -{ - __v4sf result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_ps(__m128d __A) { + __v4sf __result; + __v4si __temp; + const __v4si __vzero = {0, 0, 0, 0}; - __asm__( - 
"xvcvdpsp %x0,%x1" - : "=wa" (temp) - : "wa" (__A) - : ); + __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); #ifdef _ARCH_PWR8 - temp = vec_mergeo (temp, temp); - result = (__v4sf) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); +#ifdef __LITTLE_ENDIAN__ + __temp = vec_mergeo(__temp, __temp); +#else + __temp = vec_mergee(__temp, __temp); +#endif + __result = (__v4sf)vec_vpkudum((__vector long long)__temp, + (__vector long long)__vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; + __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); } #endif - return ((__m128)result); + return ((__m128)__result); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttpd_epi32 (__m128d __A) -{ - __v4si result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_epi32(__m128d __A) { + __v4si __result; + __v4si __temp; + const __v4si __vzero = {0, 0, 0, 0}; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. */ - __asm__( - "xvcvdpsxws %x0,%x1" - : "=wa" (temp) - : "wa" (__A) - : ); + __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); #ifdef _ARCH_PWR8 - temp = vec_mergeo (temp, temp); - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); +#ifdef __LITTLE_ENDIAN__ + __temp = vec_mergeo(__temp, __temp); +#else + __temp = vec_mergee(__temp, __temp); +#endif + __result = (__v4si)vec_vpkudum((__vector long long)__temp, + (__vector long long)__vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; + __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); } #endif - return ((__m128i) result); + return ((__m128i)__result); } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttpd_pi32 (__m128d __A) -{ - __m128i result = _mm_cvttpd_epi32 (__A); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_pi32(__m128d __A) { + __m128i __result = _mm_cvttpd_epi32(__A); - return (__m64) result[0]; + return (__m64)__result[0]; } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi128_si32 (__m128i __A) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si32(__m128i __A) { return ((__v4si)__A)[0]; } #ifdef _ARCH_PWR8 -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpi32_pd (__m64 __A) -{ - __v4si temp; - __v2di tmp2; - __v2df result; - - temp = (__v4si)vec_splats (__A); - tmp2 = (__v2di)vec_unpackl (temp); - result = vec_ctf ((__vector signed long long) tmp2, 0); - return (__m128d)result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_cvtpi32_pd(__m64 __A) { + __v4si __temp; + __v2di __tmp2; + __v2df __result; + + __temp = (__v4si)vec_splats(__A); + __tmp2 = (__v2di)vec_unpackl(__temp); + __result = vec_ctf((__vector signed long long)__tmp2, 0); + return (__m128d)__result; } #endif -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_epi32 (__m128 __A) -{ - __v4sf rounded; - __v4si result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_epi32(__m128 __A) { + __v4sf __rounded; + __v4si __result; - rounded = vec_rint((__v4sf) __A); - result = vec_cts (rounded, 0); - return (__m128i) result; + __rounded = vec_rint((__v4sf)__A); + __result = vec_cts(__rounded, 0); + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttps_epi32 (__m128 __A) -{ - __v4si result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_epi32(__m128 __A) { + __v4si __result; - result = vec_cts ((__v4sf) __A, 0); - return (__m128i) result; + __result = vec_cts((__v4sf)__A, 0); + return (__m128i)__result; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_pd (__m128 __A) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pd(__m128 __A) { /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ #ifdef vec_doubleh - return (__m128d) vec_doubleh ((__v4sf)__A); + return (__m128d)vec_doubleh((__v4sf)__A); #else /* Otherwise the compiler is not current and so need to generate the equivalent code. */ - __v4sf a = (__v4sf)__A; - __v4sf temp; - __v2df result; + __v4sf __a = (__v4sf)__A; + __v4sf __temp; + __v2df __result; #ifdef __LITTLE_ENDIAN__ /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[1], [3]}, So we use two shift left double vector word immediates to get the elements lined up. */ - temp = __builtin_vsx_xxsldwi (a, a, 3); - temp = __builtin_vsx_xxsldwi (a, temp, 2); + __temp = __builtin_vsx_xxsldwi(__a, __a, 3); + __temp = __builtin_vsx_xxsldwi(__a, __temp, 2); #else /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[0], [2]}, So we use two shift left double vector word immediates to get the elements lined up. */ - temp = vec_vmrghw (a, a); + __temp = vec_vmrghw(__a, __a); #endif - __asm__( - " xvcvspdp %x0,%x1" - : "=wa" (result) - : "wa" (temp) - : ); - return (__m128d) result; + __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :); + return (__m128d)__result; #endif } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_si32 (__m128d __A) -{ - __v2df rounded = vec_rint((__v2df) __A); - int result = ((__v2df)rounded)[0]; +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si32(__m128d __A) { + __v2df __rounded = vec_rint((__v2df)__A); + int __result = ((__v2df)__rounded)[0]; - return result; + return __result; }
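The conversions above differ only in how the fractional part is handled: _mm_cvtpd_epi32 and _mm_cvtps_epi32 round first through vec_rint (round-to-nearest-even under the default rounding mode), while the _mm_cvtt* variants truncate toward zero. A small, hypothetical sketch of that difference using only wrappers defined in this header (values illustrative; assumes -DNO_WARN_X86_INTRINSICS on a VSX target):

/* cvt_demo.c -- illustrative sketch, not part of this patch. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d d = _mm_set_pd(-2.5, 1.7); /* element [1] = -2.5, [0] = 1.7 */

  /* Rounding via vec_rint versus direct truncation toward zero. */
  int rounded = _mm_cvtsi128_si32(_mm_cvtpd_epi32(d));    /* 1.7 -> 2 */
  int truncated = _mm_cvtsi128_si32(_mm_cvttpd_epi32(d)); /* 1.7 -> 1 */

  printf("%d %d %d\n", rounded, truncated, _mm_cvtsd_si32(d)); /* 2 1 2 */
  return 0;
}

/* Intel intrinsic.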
*/ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_si64 (__m128d __A) -{ - __v2df rounded = vec_rint ((__v2df) __A ); - long long result = ((__v2df) rounded)[0]; +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si64(__m128d __A) { + __v2df __rounded = vec_rint((__v2df)__A); + long long __result = ((__v2df)__rounded)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_si64x (__m128d __A) -{ - return _mm_cvtsd_si64 ((__v2df)__A); +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si64x(__m128d __A) { + return _mm_cvtsd_si64((__v2df)__A); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_si32 (__m128d __A) -{ - int result = ((__v2df)__A)[0]; +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si32(__m128d __A) { + int __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Intel intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_si64 (__m128d __A) -{ - long long result = ((__v2df)__A)[0]; +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si64(__m128d __A) { + long long __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_si64x (__m128d __A) -{ - return _mm_cvttsd_si64 (__A); +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si64x(__m128d __A) { + return _mm_cvttsd_si64(__A); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_ss (__m128 __A, __m128d __B) -{ - __v4sf result = (__v4sf)__A; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_ss(__m128 __A, __m128d __B) { + __v4sf __result = (__v4sf)__A; #ifdef __LITTLE_ENDIAN__ - __v4sf temp_s; + __v4sf __temp_s; /* Copy double element[0] to element [1] for conversion. */ - __v2df temp_b = vec_splat((__v2df)__B, 0); + __v2df __temp_b = vec_splat((__v2df)__B, 0); /* Pre-rotate __A left 3 (logically right 1) elements. */ - result = __builtin_vsx_xxsldwi (result, result, 3); + __result = __builtin_vsx_xxsldwi(__result, __result, 3); /* Convert double to single float scalar in a vector. */ - __asm__( - "xscvdpsp %x0,%x1" - : "=wa" (temp_s) - : "wa" (temp_b) - : ); + __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :); /* Shift the resulting scalar into vector element [0]. 
*/ - result = __builtin_vsx_xxsldwi (result, temp_s, 1); + __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1); #else - result [0] = ((__v2df)__B)[0]; + __result[0] = ((__v2df)__B)[0]; #endif - return (__m128) result; + return (__m128)__result; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi32_sd (__m128d __A, int __B) -{ - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_sd(__m128d __A, int __B) { + __v2df __result = (__v2df)__A; + double __db = __B; + __result[0] = __db; + return (__m128d)__result; } /* Intel intrinsic. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64_sd (__m128d __A, long long __B) -{ - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_sd(__m128d __A, long long __B) { + __v2df __result = (__v2df)__A; + double __db = __B; + __result[0] = __db; + return (__m128d)__result; } /* Microsoft intrinsic. */ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64x_sd (__m128d __A, long long __B) -{ - return _mm_cvtsi64_sd (__A, __B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_sd(__m128d __A, long long __B) { + return _mm_cvtsi64_sd(__A, __B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_sd (__m128d __A, __m128 __B) -{ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_sd(__m128d __A, __m128 __B) { #ifdef __LITTLE_ENDIAN__ /* Use splat to move element [0] into position for the convert. */ - __v4sf temp = vec_splat ((__v4sf)__B, 0); - __v2df res; + __v4sf __temp = vec_splat((__v4sf)__B, 0); + __v2df __res; /* Convert single float scalar to double in a vector. 
*/ - __asm__( - "xscvspdp %x0,%x1" - : "=wa" (res) - : "wa" (temp) - : ); - return (__m128d) vec_mergel (res, (__v2df)__A); + __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :); + return (__m128d)vec_mergel(__res, (__v2df)__A); #else - __v2df res = (__v2df)__A; - res [0] = ((__v4sf)__B) [0]; - return (__m128d) res; + __v2df __res = (__v2df)__A; + __res[0] = ((__v4sf)__B)[0]; + return (__m128d)__res; #endif } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) -{ - __vector double result; - const int litmsk = __mask & 0x3; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { + __vector double __result; + const int __litmsk = __mask & 0x3; - if (litmsk == 0) - result = vec_mergeh (__A, __B); + if (__litmsk == 0) + __result = vec_mergeh(__A, __B); #if __GNUC__ < 6 - else if (litmsk == 1) - result = vec_xxpermdi (__B, __A, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__B, __A, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi(__B, __A, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi(__B, __A, 1); #else - else if (litmsk == 1) - result = vec_xxpermdi (__A, __B, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__A, __B, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi(__A, __B, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi(__A, __B, 1); #endif else - result = vec_mergel (__A, __B); + __result = vec_mergel(__A, __B); - return result; + return __result; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_pd (__m128d __A, __m128d __B) -{ - return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pd(__m128d __A, __m128d __B) { + return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_pd (__m128d __A, __m128d __B) -{ - return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pd(__m128d __A, __m128d __B) { + return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadh_pd (__m128d __A, double const *__B) -{ - __v2df result = (__v2df)__A; - result [1] = *__B; - return (__m128d)result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadh_pd(__m128d __A, double const *__B) { + __v2df __result = (__v2df)__A; + __result[1] = *__B; + return (__m128d)__result; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadl_pd (__m128d __A, double const *__B) -{ - __v2df result = (__v2df)__A; - result [0] = *__B; - return (__m128d)result; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_pd(__m128d __A, double const *__B) { + __v2df __result = (__v2df)__A; + __result[0] = *__B; + return (__m128d)__result; } #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
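For _mm_shuffle_pd above, only the low two mask bits are used: bit 0 selects which element of __A becomes result element [0], and bit 1 selects which element of __B becomes result element [1]. Masks 0 and 3 reduce to vec_mergeh/vec_mergel, and the remaining two cases go through vec_xxpermdi (with the operands swapped for GCC older than 6). A brief, hypothetical usage sketch (values illustrative; assumes -DNO_WARN_X86_INTRINSICS on a VSX target):

/* shuffle_demo.c -- illustrative sketch, not part of this patch. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(11.0, 10.0); /* a[0] = 10, a[1] = 11 */
  __m128d b = _mm_set_pd(21.0, 20.0); /* b[0] = 20, b[1] = 21 */
  double out[2];

  /* mask = 1: bit 0 set -> take a[1]; bit 1 clear -> take b[0]. */
  _mm_storeu_pd(out, _mm_shuffle_pd(a, b, 1));
  printf("%g %g\n", out[0], out[1]); /* 11 20 */

  /* mask = 0 is vec_mergeh -> {10, 20}; mask = 3 is vec_mergel -> {11, 21}. */
  _mm_storeu_pd(out, _mm_shuffle_pd(a, b, 3));
  printf("%g %g\n", out[0], out[1]); /* 11 21 */
  return 0;
}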
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movemask_pd (__m128d __A) -{ - __vector unsigned long long result; - static const __vector unsigned int perm_mask = - { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_pd(__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm((__v2du)__A); +#else + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ - 0x80800040, 0x80808080, 0x80808080, 0x80808080 + 0x80800040, 0x80808080, 0x80808080, 0x80808080 #else 0x80808080, 0x80808080, 0x80808080, 0x80804000 #endif - }; + }; - result = ((__vector unsigned long long) - vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + __result = ((__vector unsigned long long)vec_vbpermq( + (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_packs_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_packs_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_epi32(__m128i __A, __m128i __B) { + return (__m128i)vec_packs((__v4si)__A, (__v4si)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_packus_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packus_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi32(__m128i __A, __m128i __B) { + return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B); } -extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergel ((__vector long long) __A, - (__vector long long) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi64(__m128i __A, __m128i __B) { + return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi32(__m128i __A, __m128i __B) { + return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_mergeh ((__vector long long) __A, - (__vector long long) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi64(__m128i __A, __m128i __B) { + return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v16qu)__A + (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qu)__A + (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v8hu)__A + (__v8hu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hu)__A + (__v8hu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v4su)__A + (__v4su)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4su)__A + (__v4su)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v2du)__A + (__v2du)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi64(__m128i __A, __m128i __B) { + return 
(__m128i)((__v2du)__A + (__v2du)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_adds_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_adds_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_adds_epu8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epu8(__m128i __A, __m128i __B) { + return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_adds_epu16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epu16(__m128i __A, __m128i __B) { + return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v16qu)__A - (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qu)__A - (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v8hu)__A - (__v8hu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hu)__A - (__v8hu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v4su)__A - (__v4su)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4su)__A - (__v4su)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v2du)__A - (__v2du)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A - (__v2du)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_subs_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) -_mm_subs_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_subs_epu8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epu8(__m128i __A, __m128i __B) { + return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_subs_epu16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epu16(__m128i __A, __m128i __B) { + return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_madd_epi16 (__m128i __A, __m128i __B) -{ - __vector signed int zero = {0, 0, 0, 0}; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd_epi16(__m128i __A, __m128i __B) { + __vector signed int __zero = {0, 0, 0, 0}; - return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); + return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mulhi_epi16 (__m128i __A, __m128i __B) -{ - __vector signed int w0, w1; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_epi16(__m128i __A, __m128i __B) { + __vector signed int __w0, __w1; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ - 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, - 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, + 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F #else - 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, - 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, + 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D #endif - }; + }; - w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); - w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B); + __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B); + return (__m128i)vec_perm(__w0, __w1, __xform1); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mullo_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) ((__v8hi)__A * (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hi)__A * (__v8hi)__B); } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_su32 (__m64 __A, __m64 __B) -{ - unsigned int a = __A; - unsigned int b = __B; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_su32(__m64 __A, __m64 __B) { + unsigned int __a = __A; + unsigned int __b = __B; - return ((__m64)a * (__m64)b); + return ((__m64)__a * 
(__m64)__b); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_epu32 (__m128i __A, __m128i __B) -{ +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_epu32(__m128i __A, __m128i __B) { #if __GNUC__ < 8 - __v2du result; + __v2du __result; #ifdef __LITTLE_ENDIAN__ /* VMX Vector Multiply Odd Unsigned Word. */ - __asm__( - "vmulouw %0,%1,%2" - : "=v" (result) - : "v" (__A), "v" (__B) - : ); + __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); #else /* VMX Vector Multiply Even Unsigned Word. */ - __asm__( - "vmuleuw %0,%1,%2" - : "=v" (result) - : "v" (__A), "v" (__B) - : ); + __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); #endif - return (__m128i) result; + return (__m128i)__result; #else - return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); + return (__m128i)vec_mule((__v4su)__A, (__v4su)__B); #endif } +#endif -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_slli_epi16 (__m128i __A, int __B) -{ - __v8hu lshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi16(__m128i __A, int __B) { + __v8hu __lshift; + __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; - if (__B >= 0 && __B < 16) - { - if (__builtin_constant_p(__B)) - lshift = (__v8hu) vec_splat_s16(__B); - else - lshift = vec_splats ((unsigned short) __B); + if (__B >= 0 && __B < 16) { + if (__builtin_constant_p(__B)) + __lshift = (__v8hu)vec_splat_s16(__B); + else + __lshift = vec_splats((unsigned short)__B); - result = vec_sl ((__v8hi) __A, lshift); - } + __result = vec_sl((__v8hi)__A, __lshift); + } - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_slli_epi32 (__m128i __A, int __B) -{ - __v4su lshift; - __v4si result = { 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi32(__m128i __A, int __B) { + __v4su __lshift; + __v4si __result = {0, 0, 0, 0}; - if (__B >= 0 && __B < 32) - { - if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v4su) vec_splat_s32(__B); - else - lshift = vec_splats ((unsigned int) __B); + if (__B >= 0 && __B < 32) { + if (__builtin_constant_p(__B) && __B < 16) + __lshift = (__v4su)vec_splat_s32(__B); + else + __lshift = vec_splats((unsigned int)__B); - result = vec_sl ((__v4si) __A, lshift); - } + __result = vec_sl((__v4si)__A, __lshift); + } - return (__m128i) result; + return (__m128i)__result; } #ifdef _ARCH_PWR8 -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_slli_epi64 (__m128i __A, int __B) -{ - __v2du lshift; - __v2di result = { 0, 0 }; - - if (__B >= 0 && __B < 64) - { - if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v2du) vec_splat_s32(__B); - else - lshift = (__v2du) vec_splats ((unsigned int) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi64(__m128i __A, int __B) { + __v2du __lshift; + __v2di __result = {0, 0}; + + if (__B >= 0 && __B < 64) { + if (__builtin_constant_p(__B) && __B < 16) + __lshift = (__v2du)vec_splat_s32(__B); + else + __lshift = (__v2du)vec_splats((unsigned int)__B); - result = vec_sl ((__v2di) __A, lshift); - } + __result = vec_sl((__v2di)__A, __lshift); + } - return (__m128i) 
result; + return (__m128i)__result; } #endif -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srai_epi16 (__m128i __A, int __B) -{ - __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hi result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_epi16(__m128i __A, int __B) { + __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15}; + __v8hi __result; - if (__B < 16) - { - if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); - else - rshift = vec_splats ((unsigned short) __B); - } - result = vec_sra ((__v8hi) __A, rshift); + if (__B < 16) { + if (__builtin_constant_p(__B)) + __rshift = (__v8hu)vec_splat_s16(__B); + else + __rshift = vec_splats((unsigned short)__B); + } + __result = vec_sra((__v8hi)__A, __rshift); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srai_epi32 (__m128i __A, int __B) -{ - __v4su rshift = { 31, 31, 31, 31 }; - __v4si result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_epi32(__m128i __A, int __B) { + __v4su __rshift = {31, 31, 31, 31}; + __v4si __result; - if (__B < 32) - { - if (__builtin_constant_p(__B)) - { - if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); - else - rshift = (__v4su) vec_splats((unsigned int)__B); - } + if (__B < 32) { + if (__builtin_constant_p(__B)) { + if (__B < 16) + __rshift = (__v4su)vec_splat_s32(__B); else - rshift = vec_splats ((unsigned int) __B); - } - result = vec_sra ((__v4si) __A, rshift); + __rshift = (__v4su)vec_splats((unsigned int)__B); + } else + __rshift = vec_splats((unsigned int)__B); + } + __result = vec_sra((__v4si)__A, __rshift); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_bslli_si128 (__m128i __A, const int __N) -{ - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_bslli_si128(__m128i __A, const int __N) { + __v16qu __result; + const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; if (__N < 16) - result = vec_sld ((__v16qu) __A, zeros, __N); + __result = vec_sld((__v16qu)__A, __zeros, __N); else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_bsrli_si128 (__m128i __A, const int __N) -{ - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_bsrli_si128(__m128i __A, const int __N) { + __v16qu __result; + const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; if (__N < 16) #ifdef __LITTLE_ENDIAN__ if (__builtin_constant_p(__N)) /* Would like to use Vector Shift Left Double by Octet - Immediate here to use the immediate form and avoid - load of __N * 8 value into a separate VR. */ - result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); + Immediate here to use the immediate form and avoid + load of __N * 8 value into a separate VR. 
*/ + __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N)); else #endif - { - __v16qu shift = vec_splats((unsigned char)(__N*8)); + { + __v16qu __shift = vec_splats((unsigned char)(__N * 8)); #ifdef __LITTLE_ENDIAN__ - result = vec_sro ((__v16qu)__A, shift); + __result = vec_sro((__v16qu)__A, __shift); #else - result = vec_slo ((__v16qu)__A, shift); + __result = vec_slo((__v16qu)__A, __shift); #endif - } + } else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srli_si128 (__m128i __A, const int __N) -{ - return _mm_bsrli_si128 (__A, __N); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_si128(__m128i __A, const int __N) { + return _mm_bsrli_si128(__A, __N); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_slli_si128 (__m128i __A, const int _imm5) -{ - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_si128(__m128i __A, const int _imm5) { + __v16qu __result; + const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; if (_imm5 < 16) #ifdef __LITTLE_ENDIAN__ - result = vec_sld ((__v16qu) __A, zeros, _imm5); + __result = vec_sld((__v16qu)__A, __zeros, _imm5); #else - result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); + __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5)); #endif else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srli_epi16 (__m128i __A, int __B) -{ - __v8hu rshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + _mm_srli_epi16(__m128i __A, int __B) { + __v8hu __rshift; + __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; - if (__B < 16) - { - if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); - else - rshift = vec_splats ((unsigned short) __B); + if (__B < 16) { + if (__builtin_constant_p(__B)) + __rshift = (__v8hu)vec_splat_s16(__B); + else + __rshift = vec_splats((unsigned short)__B); - result = vec_sr ((__v8hi) __A, rshift); - } + __result = vec_sr((__v8hi)__A, __rshift); + } - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srli_epi32 (__m128i __A, int __B) -{ - __v4su rshift; - __v4si result = { 0, 0, 0, 0 }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_epi32(__m128i __A, int __B) { + __v4su __rshift; + __v4si __result = {0, 0, 0, 0}; - if (__B < 32) - { - if (__builtin_constant_p(__B)) - { - if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); - else - rshift = (__v4su) vec_splats((unsigned int)__B); - } + if (__B < 32) { + if (__builtin_constant_p(__B)) { + if (__B < 16) + __rshift = (__v4su)vec_splat_s32(__B); else - rshift = vec_splats ((unsigned int) __B); + __rshift = (__v4su)vec_splats((unsigned int)__B); + } else + __rshift = vec_splats((unsigned int)__B); - result = vec_sr ((__v4si) __A, rshift); - } + __result = vec_sr((__v4si)__A, __rshift); + } - return (__m128i) result; + return (__m128i)__result; } #ifdef _ARCH_PWR8 -extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srli_epi64 (__m128i __A, int __B) -{ - __v2du rshift; - __v2di result = { 0, 0 }; - - if (__B < 64) - { - if (__builtin_constant_p(__B)) - { - if (__B < 16) - rshift = (__v2du) vec_splat_s32(__B); - else - rshift = (__v2du) vec_splats((unsigned long long)__B); - } +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_epi64(__m128i __A, int __B) { + __v2du __rshift; + __v2di __result = {0, 0}; + + if (__B < 64) { + if (__builtin_constant_p(__B)) { + if (__B < 16) + __rshift = (__v2du)vec_splat_s32(__B); else - rshift = (__v2du) vec_splats ((unsigned int) __B); + __rshift = (__v2du)vec_splats((unsigned long long)__B); + } else + __rshift = (__v2du)vec_splats((unsigned int)__B); - result = vec_sr ((__v2di) __A, rshift); - } + __result = vec_sr((__v2di)__A, __rshift); + } - return (__m128i) result; + return (__m128i)__result; } #endif -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sll_epi16 (__m128i __A, __m128i __B) -{ - __v8hu lshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi16(__m128i __A, __m128i __B) { + __v8hu __lshift; + __vector __bool short __shmask; + const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v8hu) __B, 0); + __lshift = vec_splat((__v8hu)__B, 0); #else - lshift = vec_splat ((__v8hu) __B, 3); + __lshift = vec_splat((__v8hu)__B, 3); #endif - shmask = vec_cmple (lshift, shmax); - result = vec_sl ((__v8hu) __A, lshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple(__lshift, __shmax); + __result = vec_sl((__v8hu)__A, __lshift); + __result = vec_sel((__v8hu)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sll_epi32 (__m128i __A, __m128i __B) -{ - __v4su lshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi32(__m128i __A, __m128i __B) { + __v4su __lshift; + __vector __bool int __shmask; + const __v4su __shmax = {32, 32, 32, 32}; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v4su) __B, 0); + __lshift = vec_splat((__v4su)__B, 0); #else - lshift = vec_splat ((__v4su) __B, 1); + __lshift = vec_splat((__v4su)__B, 1); #endif - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v4su) __A, lshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt(__lshift, __shmax); + __result = vec_sl((__v4su)__A, __lshift); + __result = vec_sel((__v4su)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } #ifdef _ARCH_PWR8 -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sll_epi64 (__m128i __A, __m128i __B) -{ - __v2du lshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi64(__m128i __A, __m128i __B) { + __v2du __lshift; + __vector __bool long long __shmask; + const __v2du 
__shmax = {64, 64}; + __v2du __result; - lshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v2du) __A, lshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + __lshift = vec_splat((__v2du)__B, 0); + __shmask = vec_cmplt(__lshift, __shmax); + __result = vec_sl((__v2du)__A, __lshift); + __result = vec_sel((__v2du)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } #endif -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sra_epi16 (__m128i __A, __m128i __B) -{ - const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu rshift; - __v8hi result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_epi16(__m128i __A, __m128i __B) { + const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15}; + __v8hu __rshift; + __v8hi __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu)__B, 0); + __rshift = vec_splat((__v8hu)__B, 0); #else - rshift = vec_splat ((__v8hu)__B, 3); + __rshift = vec_splat((__v8hu)__B, 3); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v8hi) __A, rshift); + __rshift = vec_min(__rshift, __rshmax); + __result = vec_sra((__v8hi)__A, __rshift); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sra_epi32 (__m128i __A, __m128i __B) -{ - const __v4su rshmax = { 31, 31, 31, 31 }; - __v4su rshift; - __v4si result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_epi32(__m128i __A, __m128i __B) { + const __v4su __rshmax = {31, 31, 31, 31}; + __v4su __rshift; + __v4si __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su)__B, 0); + __rshift = vec_splat((__v4su)__B, 0); #else - rshift = vec_splat ((__v4su)__B, 1); + __rshift = vec_splat((__v4su)__B, 1); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v4si) __A, rshift); + __rshift = vec_min(__rshift, __rshmax); + __result = vec_sra((__v4si)__A, __rshift); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srl_epi16 (__m128i __A, __m128i __B) -{ - __v8hu rshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi16(__m128i __A, __m128i __B) { + __v8hu __rshift; + __vector __bool short __shmask; + const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu) __B, 0); + __rshift = vec_splat((__v8hu)__B, 0); #else - rshift = vec_splat ((__v8hu) __B, 3); + __rshift = vec_splat((__v8hu)__B, 3); #endif - shmask = vec_cmple (rshift, shmax); - result = vec_sr ((__v8hu) __A, rshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple(__rshift, __shmax); + __result = vec_sr((__v8hu)__A, __rshift); + __result = vec_sel((__v8hu)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srl_epi32 (__m128i __A, __m128i __B) -{ - __v4su rshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; +extern __inline 
__m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi32(__m128i __A, __m128i __B) { + __v4su __rshift; + __vector __bool int __shmask; + const __v4su __shmax = {32, 32, 32, 32}; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su) __B, 0); + __rshift = vec_splat((__v4su)__B, 0); #else - rshift = vec_splat ((__v4su) __B, 1); + __rshift = vec_splat((__v4su)__B, 1); #endif - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v4su) __A, rshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt(__rshift, __shmax); + __result = vec_sr((__v4su)__A, __rshift); + __result = vec_sel((__v4su)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } #ifdef _ARCH_PWR8 -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srl_epi64 (__m128i __A, __m128i __B) -{ - __v2du rshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi64(__m128i __A, __m128i __B) { + __v2du __rshift; + __vector __bool long long __shmask; + const __v2du __shmax = {64, 64}; + __v2du __result; - rshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v2du) __A, rshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + __rshift = vec_splat((__v2du)__B, 0); + __shmask = vec_cmplt(__rshift, __shmax); + __result = vec_sr((__v2du)__A, __rshift); + __result = vec_sel((__v2du)__shmask, __result, __shmask); - return (__m128i) result; + return (__m128i)__result; } #endif -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_and_pd (__m128d __A, __m128d __B) -{ - return (vec_and ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_pd(__m128d __A, __m128d __B) { + return (vec_and((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_andnot_pd (__m128d __A, __m128d __B) -{ - return (vec_andc ((__v2df) __B, (__v2df) __A)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_pd(__m128d __A, __m128d __B) { + return (vec_andc((__v2df)__B, (__v2df)__A)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_or_pd (__m128d __A, __m128d __B) -{ - return (vec_or ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_pd(__m128d __A, __m128d __B) { + return (vec_or((__v2df)__A, (__v2df)__B)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_xor_pd (__m128d __A, __m128d __B) -{ - return (vec_xor ((__v2df) __A, (__v2df) __B)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_pd(__m128d __A, __m128d __B) { + return (vec_xor((__v2df)__A, (__v2df)__B)); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_and_si128 (__m128i __A, __m128i __B) -{ - return (__m128i)vec_and ((__v2di) __A, (__v2di) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_si128(__m128i __A, __m128i __B) { + return 
(__m128i)vec_and((__v2di)__A, (__v2di)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_andnot_si128 (__m128i __A, __m128i __B) -{ - return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_si128(__m128i __A, __m128i __B) { + return (__m128i)vec_andc((__v2di)__B, (__v2di)__A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_or_si128 (__m128i __A, __m128i __B) -{ - return (__m128i)vec_or ((__v2di) __A, (__v2di) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_si128(__m128i __A, __m128i __B) { + return (__m128i)vec_or((__v2di)__A, (__v2di)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_xor_si128 (__m128i __A, __m128i __B) -{ - return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_si128(__m128i __A, __m128i __B) { + return (__m128i)vec_xor((__v2di)__A, (__v2di)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi32(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi32(__m128i __A, __m128i __B) { + return 
(__m128i)vec_cmplt((__v4si)__A, (__v4si)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi8(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi32(__m128i __A, __m128i __B) { + return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_extract_epi16 (__m128i const __A, int const __N) -{ - return (unsigned short) ((__v8hi)__A)[__N & 7]; +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi16(__m128i const __A, int const __N) { + return (unsigned short)((__v8hi)__A)[__N & 7]; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) -{ - __v8hi result = (__v8hi)__A; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi16(__m128i const __A, int const __D, int const __N) { + __v8hi __result = (__v8hi)__A; - result [(__N & 7)] = __D; + __result[(__N & 7)] = __D; - return (__m128i) result; + return (__m128i)__result; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_epu8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu8(__m128i __A, __m128i __B) { + return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epi16(__m128i __A, __m128i __B) { + return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_epu8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B); +extern __inline __m128i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_min_epu8(__m128i __A, __m128i __B) { + return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B); } - #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ -/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movemask_epi8 (__m128i __A) -{ - __vector unsigned long long result; - static const __vector unsigned char perm_mask = - { - 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, - 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 - }; +/* Return a mask created from the most significant bit of each 8-bit + element in A. */ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_epi8(__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm((__v16qu)__A); +#else + __vector unsigned long long __result; + static const __vector unsigned char __perm_mask = { + 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; - result = ((__vector unsigned long long) - vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + __result = ((__vector unsigned long long)vec_vbpermq( + (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mulhi_epu16 (__m128i __A, __m128i __B) -{ - __v4su w0, w1; - __v16qu xform1 = { +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_epu16(__m128i __A, __m128i __B) { + __v4su __w0, __w1; + __v16qu __xform1 = { #ifdef __LITTLE_ENDIAN__ - 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, - 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, + 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F #else - 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, - 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, + 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D #endif - }; + }; - w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); - w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B); + __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B); + return (__m128i)vec_perm(__w0, __w1, __xform1); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shufflehi_epi16 (__m128i __A, const int __mask) -{ - unsigned long element_selector_98 = __mask & 0x03; - unsigned long element_selector_BA = (__mask >> 2) & 0x03; - unsigned long element_selector_DC = (__mask >> 4) & 0x03; - unsigned long element_selector_FE = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = - { +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shufflehi_epi16(__m128i __A, const int __mask) { + unsigned long __element_selector_98 = __mask & 0x03; + unsigned long __element_selector_BA = (__mask >> 2) & 0x03; + unsigned long __element_selector_DC = (__mask >> 4) & 0x03; + unsigned long __element_selector_FE = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ - 0x0908, 0x0B0A, 0x0D0C, 0x0F0E 
+ 0x0908, 0x0B0A, 0x0D0C, 0x0F0E #else - 0x0809, 0x0A0B, 0x0C0D, 0x0E0F + 0x0809, 0x0A0B, 0x0C0D, 0x0E0F #endif - }; - __v2du pmask = + }; + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ - { 0x1716151413121110UL, 0UL}; + {0x1716151413121110UL, 0UL}; #else - { 0x1011121314151617UL, 0UL}; + {0x1011121314151617UL, 0UL}; #endif - __m64_union t; - __v2du a, r; - - t.as_short[0] = permute_selectors[element_selector_98]; - t.as_short[1] = permute_selectors[element_selector_BA]; - t.as_short[2] = permute_selectors[element_selector_DC]; - t.as_short[3] = permute_selectors[element_selector_FE]; - pmask[1] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; -} - -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shufflelo_epi16 (__m128i __A, const int __mask) -{ - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = - { + __m64_union __t; + __v2du __a, __r; + + __t.as_short[0] = __permute_selectors[__element_selector_98]; + __t.as_short[1] = __permute_selectors[__element_selector_BA]; + __t.as_short[2] = __permute_selectors[__element_selector_DC]; + __t.as_short[3] = __permute_selectors[__element_selector_FE]; + __pmask[1] = __t.as_m64; + __a = (__v2du)__A; + __r = vec_perm(__a, __a, (__vector unsigned char)__pmask); + return (__m128i)__r; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shufflelo_epi16(__m128i __A, const int __mask) { + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ - 0x0100, 0x0302, 0x0504, 0x0706 + 0x0100, 0x0302, 0x0504, 0x0706 #else - 0x0001, 0x0203, 0x0405, 0x0607 + 0x0001, 0x0203, 0x0405, 0x0607 #endif - }; - __v2du pmask = + }; + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ - { 0UL, 0x1f1e1d1c1b1a1918UL}; + {0UL, 0x1f1e1d1c1b1a1918UL}; #else - { 0UL, 0x18191a1b1c1d1e1fUL}; + {0UL, 0x18191a1b1c1d1e1fUL}; #endif - __m64_union t; - __v2du a, r; - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; - pmask[0] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; -} - -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_epi32 (__m128i __A, const int __mask) -{ - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = - { + __m64_union __t; + __v2du __a, __r; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; + __pmask[0] = __t.as_m64; + 
__a = (__v2du)__A; + __r = vec_perm(__a, __a, (__vector unsigned char)__pmask); + return (__m128i)__r; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_epi32(__m128i __A, const int __mask) { + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ - 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C #else 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif - }; - __v4su t; - - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); -} - -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) -{ - __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; - __v16qu mask, tmp; - __m128i_u *p = (__m128i_u*)__C; - - tmp = (__v16qu)_mm_loadu_si128(p); - mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); - tmp = vec_sel (tmp, (__v16qu)__A, mask); - _mm_storeu_si128 (p, (__m128i)tmp); -} - -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_avg_epu8 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B); -} - -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_avg_epu16 (__m128i __A, __m128i __B) -{ - return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B); -} - - -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sad_epu8 (__m128i __A, __m128i __B) -{ - __v16qu a, b; - __v16qu vmin, vmax, vabsdiff; - __v4si vsum; - const __v4su zero = { 0, 0, 0, 0 }; - __v4si result; - - a = (__v16qu) __A; - b = (__v16qu) __B; - vmin = vec_min (a, b); - vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); + }; + __v4su __t; + + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return (__m128i)vec_perm((__v4si)__A, (__v4si)__A, + (__vector unsigned char)__t); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) { + __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; + __v16qu __mask, __tmp; + __m128i_u *__p = (__m128i_u *)__C; + + __tmp = (__v16qu)_mm_loadu_si128(__p); + __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit); + __tmp = vec_sel(__tmp, (__v16qu)__A, __mask); + _mm_storeu_si128(__p, (__m128i)__tmp); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_epu8(__m128i __A, __m128i __B) { + return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_epu16(__m128i __A, __m128i __B) { + return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sad_epu8(__m128i __A, __m128i __B) { + __v16qu __a, __b; + __v16qu __vabsdiff; + __v4si __vsum; + const __v4su __zero = {0, 0, 0, 0}; + __v4si __result; + + __a = (__v16qu)__A; + __b = (__v16qu)__B; +#ifndef _ARCH_PWR9 + __v16qu __vmin = vec_min(__a, __b); + __v16qu __vmax = vec_max(__a, __b); + __vabsdiff = vec_sub(__vmax, __vmin); +#else + __vabsdiff = vec_absd(__a, __b); +#endif /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); - /* Sum across four integers with two integer results. */ - result = vec_sum2s (vsum, (__vector signed int) zero); - /* Rotate the sums into the correct position. */ + __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); #ifdef __LITTLE_ENDIAN__ - result = vec_sld (result, result, 4); + /* Sum across four integers with two integer results. */ + __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero)); + /* Note: vec_sum2s could be used here, but on little-endian, vector + shifts are added that are not needed for this use-case. + A vector shift to correctly position the 32-bit integer results + (currently at [0] and [2]) to [1] and [3] would then need to be + swapped back again since the desired results are two 64-bit + integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ #else - result = vec_sld (result, result, 6); -#endif + /* Sum across four integers with two integer results. */ + __result = vec_sum2s(__vsum, (__vector signed int)__zero); /* Rotate the sums into the correct position. */ - return (__m128i) result; + __result = vec_sld(__result, __result, 6); +#endif + return (__m128i)__result; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_si32 (int *__A, int __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si32(int *__A, int __B) { /* Use the data cache block touch for store transient. */ - __asm__ ( - "dcbtstt 0,%0" - : - : "b" (__A) - : "memory" - ); + __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); *__A = __B; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_si64 (long long int *__A, long long int __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si64(long long int *__A, long long int __B) { /* Use the data cache block touch for store transient. */ - __asm__ ( - " dcbtstt 0,%0" - : - : "b" (__A) - : "memory" - ); + __asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory"); *__A = __B; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_si128 (__m128i *__A, __m128i __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si128(__m128i *__A, __m128i __B) { /* Use the data cache block touch for store transient. */ - __asm__ ( - "dcbtstt 0,%0" - : - : "b" (__A) - : "memory" - ); + __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); *__A = __B; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_pd (double *__A, __m128d __B) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_pd(double *__A, __m128d __B) { /* Use the data cache block touch for store transient. 
*/ - __asm__ ( - "dcbtstt 0,%0" - : - : "b" (__A) - : "memory" - ); - *(__m128d*)__A = __B; -} - -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_clflush (void const *__A) -{ + __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); + *(__m128d *)__A = __B; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clflush(void const *__A) { /* Use the data cache block flush. */ - __asm__ ( - "dcbf 0,%0" - : - : "b" (__A) - : "memory" - ); + __asm__("dcbf 0,%0" : : "b"(__A) : "memory"); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_lfence (void) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_lfence(void) { /* Use light weight sync for load to load ordering. */ - __atomic_thread_fence (__ATOMIC_RELEASE); + __atomic_thread_fence(__ATOMIC_RELEASE); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mfence (void) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mfence(void) { /* Use heavy weight sync for any to any ordering. */ - __atomic_thread_fence (__ATOMIC_SEQ_CST); + __atomic_thread_fence(__ATOMIC_SEQ_CST); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi32_si128 (int __A) -{ - return _mm_set_epi32 (0, 0, 0, __A); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_si128(int __A) { + return _mm_set_epi32(0, 0, 0, __A); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64_si128 (long long __A) -{ - return __extension__ (__m128i)(__v2di){ __A, 0LL }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si128(long long __A) { + return __extension__(__m128i)(__v2di){__A, 0LL}; } /* Microsoft intrinsic. */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64x_si128 (long long __A) -{ - return __extension__ (__m128i)(__v2di){ __A, 0LL }; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_si128(long long __A) { + return __extension__(__m128i)(__v2di){__A, 0LL}; } /* Casts between various SP, DP, INT vector types. Note that these do no conversion of values, they just change the type. 
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castpd_ps(__m128d __A) -{ - return (__m128) __A; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castpd_si128(__m128d __A) -{ - return (__m128i) __A; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castps_pd(__m128 __A) -{ - return (__m128d) __A; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castps_si128(__m128 __A) -{ - return (__m128i) __A; +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_si128(__m128 __A) { + return (__m128i)__A; } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castsi128_ps(__m128i __A) -{ - return (__m128) __A; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_ps(__m128i __A) { + return (__m128)__A; } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_castsi128_pd(__m128i __A) -{ - return (__m128d) __A; +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_pd(__m128i __A) { + return (__m128d)__A; } #else #include_next <emmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ #endif /* EMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/immintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/immintrin.h new file mode 100644 index 000000000000..c1ada9889d4a --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/immintrin.h @@ -0,0 +1,27 @@ +/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef IMMINTRIN_H_ +#define IMMINTRIN_H_ + +#include <x86gprintrin.h> + +#include <mmintrin.h> + +#include <xmmintrin.h> + +#include <emmintrin.h> + +#include <pmmintrin.h> + +#include <tmmintrin.h> + +#include <smmintrin.h> + +#endif /* IMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mm_malloc.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mm_malloc.h index 24b14c8e07c0..7c1e625e44d5 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mm_malloc.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mm_malloc.h @@ -10,38 +10,33 @@ #ifndef _MM_MALLOC_H_INCLUDED #define _MM_MALLOC_H_INCLUDED -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) #include <stdlib.h> /* We can't depend on <stdlib.h> since the prototype of posix_memalign may not be visible. */ #ifndef __cplusplus -extern int posix_memalign (void **, size_t, size_t); +extern int posix_memalign(void **, size_t, size_t); #else -extern "C" int posix_memalign (void **, size_t, size_t) throw (); +extern "C" int posix_memalign(void **, size_t, size_t); #endif -static __inline void * -_mm_malloc (size_t size, size_t alignment) -{ +static __inline void *_mm_malloc(size_t __size, size_t __alignment) { /* PowerPC64 ELF V2 ABI requires quadword alignment. */ - size_t vec_align = sizeof (__vector float); - void *ptr; + size_t __vec_align = sizeof(__vector float); + void *__ptr; - if (alignment < vec_align) - alignment = vec_align; - if (posix_memalign (&ptr, alignment, size) == 0) - return ptr; + if (__alignment < __vec_align) + __alignment = __vec_align; + if (posix_memalign(&__ptr, __alignment, __size) == 0) + return __ptr; else return NULL; } -static __inline void -_mm_free (void * ptr) -{ - free (ptr); -} +static __inline void _mm_free(void *__ptr) { free(__ptr); } #else #include_next <mm_malloc.h> diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h index c55c44726f00..0be3af2b0bd7 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h @@ -35,7 +35,8 @@ #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) #include <altivec.h> /* The Intel API is flexible enough that we must allow aliasing with other @@ -149,17 +150,17 @@ extern __inline long long extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pi16(__m64 __m1, __m64 __m2) { - __vector signed short vm1; - __vector signed char vresult; + __vector signed short __vm1; + __vector signed char __vresult; - vm1 = (__vector signed short)(__vector unsigned long long) + __vm1 = (__vector signed short)(__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ {__m1, __m2}; #else {__m2, __m1}; #endif - vresult = vec_packs(vm1, vm1); - return (__m64)((__vector long long)vresult)[0]; + __vresult = vec_packs(__vm1, __vm1); + return (__m64)((__vector long long)__vresult)[0]; } extern __inline __m64 @@ -174,17 +175,17 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
_mm_packs_pi32(__m64 __m1, __m64 __m2) { - __vector signed int vm1; - __vector signed short vresult; + __vector signed int __vm1; + __vector signed short __vresult; - vm1 = (__vector signed int)(__vector unsigned long long) + __vm1 = (__vector signed int)(__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ {__m1, __m2}; #else {__m2, __m1}; #endif - vresult = vec_packs(vm1, vm1); - return (__m64)((__vector long long)vresult)[0]; + __vresult = vec_packs(__vm1, __vm1); + return (__m64)((__vector long long)__vresult)[0]; } extern __inline __m64 @@ -199,19 +200,20 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pu16(__m64 __m1, __m64 __m2) { - __vector unsigned char r; - __vector signed short vm1 = (__vector signed short)(__vector long long) + __vector unsigned char __r; + __vector signed short __vm1 = (__vector signed short)(__vector long long) #ifdef __LITTLE_ENDIAN__ {__m1, __m2}; #else {__m2, __m1}; #endif const __vector signed short __zero = {0}; - __vector __bool short __select = vec_cmplt(vm1, __zero); - r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1); - __vector __bool char packsel = vec_pack(__select, __select); - r = vec_sel(r, (const __vector unsigned char)__zero, packsel); - return (__m64)((__vector long long)r)[0]; + __vector __bool short __select = vec_cmplt(__vm1, __zero); + __r = + vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1); + __vector __bool char __packsel = vec_pack(__select, __select); + __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel); + return (__m64)((__vector long long)__r)[0]; } extern __inline __m64 @@ -227,28 +229,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats(__m1); - b = (__vector unsigned char)vec_splats(__m2); - c = vec_mergel(a, b); - return (__m64)((__vector long long)c)[1]; + __a = (__vector unsigned char)vec_splats(__m1); + __b = (__vector unsigned char)vec_splats(__m2); + __c = vec_mergel(__a, __b); + return (__m64)((__vector long long)__c)[1]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[4]; - res.as_char[1] = m2.as_char[4]; - res.as_char[2] = m1.as_char[5]; - res.as_char[3] = m2.as_char[5]; - res.as_char[4] = m1.as_char[6]; - res.as_char[5] = m2.as_char[6]; - res.as_char[6] = m1.as_char[7]; - res.as_char[7] = m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[4]; + __res.as_char[1] = __mu2.as_char[4]; + __res.as_char[2] = __mu1.as_char[5]; + __res.as_char[3] = __mu2.as_char[5]; + __res.as_char[4] = __mu1.as_char[6]; + __res.as_char[5] = __mu2.as_char[6]; + __res.as_char[6] = __mu1.as_char[7]; + __res.as_char[7] = __mu2.as_char[7]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -263,17 +265,17 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[2]; - res.as_short[1] = m2.as_short[2]; - res.as_short[2] = m1.as_short[3]; - res.as_short[3] = 
m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[2]; + __res.as_short[1] = __mu2.as_short[2]; + __res.as_short[2] = __mu1.as_short[3]; + __res.as_short[3] = __mu2.as_short[3]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; } extern __inline __m64 @@ -286,15 +288,15 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[1]; - res.as_int[1] = m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[1]; + __res.as_int[1] = __mu2.as_int[1]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; } extern __inline __m64 @@ -308,28 +310,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats(__m1); - b = (__vector unsigned char)vec_splats(__m2); - c = vec_mergel(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector unsigned char)vec_splats(__m1); + __b = (__vector unsigned char)vec_splats(__m2); + __c = vec_mergel(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0]; - res.as_char[1] = m2.as_char[0]; - res.as_char[2] = m1.as_char[1]; - res.as_char[3] = m2.as_char[1]; - res.as_char[4] = m1.as_char[2]; - res.as_char[5] = m2.as_char[2]; - res.as_char[6] = m1.as_char[3]; - res.as_char[7] = m2.as_char[3]; + __res.as_char[0] = __mu1.as_char[0]; + __res.as_char[1] = __mu2.as_char[0]; + __res.as_char[2] = __mu1.as_char[1]; + __res.as_char[3] = __mu2.as_char[1]; + __res.as_char[4] = __mu1.as_char[2]; + __res.as_char[5] = __mu2.as_char[2]; + __res.as_char[6] = __mu1.as_char[3]; + __res.as_char[7] = __mu2.as_char[3]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -343,17 +345,17 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0]; - res.as_short[1] = m2.as_short[0]; - res.as_short[2] = m1.as_short[1]; - res.as_short[3] = m2.as_short[1]; + __res.as_short[0] = __mu1.as_short[0]; + __res.as_short[1] = __mu2.as_short[0]; + __res.as_short[2] = __mu1.as_short[1]; + __res.as_short[3] = __mu2.as_short[1]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; } extern __inline __m64 @@ -367,15 +369,15 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0]; - res.as_int[1] = m2.as_int[0]; + __res.as_int[0] = __mu1.as_int[0]; + __res.as_int[1] = __mu2.as_int[0]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; } extern __inline __m64 @@ -389,28 +391,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) _mm_add_pi8(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats(__m1); - b = (__vector signed char)vec_splats(__m2); - c = vec_add(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed char)vec_splats(__m1); + __b = (__vector signed char)vec_splats(__m2); + __c = vec_add(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] + m2.as_char[0]; - res.as_char[1] = m1.as_char[1] + m2.as_char[1]; - res.as_char[2] = m1.as_char[2] + m2.as_char[2]; - res.as_char[3] = m1.as_char[3] + m2.as_char[3]; - res.as_char[4] = m1.as_char[4] + m2.as_char[4]; - res.as_char[5] = m1.as_char[5] + m2.as_char[5]; - res.as_char[6] = m1.as_char[6] + m2.as_char[6]; - res.as_char[7] = m1.as_char[7] + m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -425,24 +427,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_pi16(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = vec_add(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = vec_add(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] + m2.as_short[0]; - res.as_short[1] = m1.as_short[1] + m2.as_short[1]; - res.as_short[2] = m1.as_short[2] + m2.as_short[2]; - res.as_short[3] = m1.as_short[3] + m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -457,22 +459,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_pi32(__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats(__m1); - b = (__vector signed int)vec_splats(__m2); - c = vec_add(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed int)vec_splats(__m1); + __b = (__vector signed int)vec_splats(__m2); + __c = vec_add(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] + m2.as_int[0]; - 
res.as_int[1] = m1.as_int[1] + m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -487,28 +489,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_pi8(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats(__m1); - b = (__vector signed char)vec_splats(__m2); - c = vec_sub(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed char)vec_splats(__m1); + __b = (__vector signed char)vec_splats(__m2); + __c = vec_sub(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] - m2.as_char[0]; - res.as_char[1] = m1.as_char[1] - m2.as_char[1]; - res.as_char[2] = m1.as_char[2] - m2.as_char[2]; - res.as_char[3] = m1.as_char[3] - m2.as_char[3]; - res.as_char[4] = m1.as_char[4] - m2.as_char[4]; - res.as_char[5] = m1.as_char[5] - m2.as_char[5]; - res.as_char[6] = m1.as_char[6] - m2.as_char[6]; - res.as_char[7] = m1.as_char[7] - m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -523,24 +525,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_pi16(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = vec_sub(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = vec_sub(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] - m2.as_short[0]; - res.as_short[1] = m1.as_short[1] - m2.as_short[1]; - res.as_short[2] = m1.as_short[2] - m2.as_short[2]; - res.as_short[3] = m1.as_short[3] - m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -555,22 +557,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_pi32(__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats(__m1); - b = (__vector signed int)vec_splats(__m2); - c = vec_sub(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed int)vec_splats(__m1); + __b = 
(__vector signed int)vec_splats(__m2); + __c = vec_sub(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] - m2.as_int[0]; - res.as_int[1] = m1.as_int[1] - m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1]; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -708,25 +710,25 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { #if defined(_ARCH_PWR6) && defined(__powerpc64__) - __m64 res; - __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :); - return (res); + __m64 __res; + __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :); + return (__res); #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0; - res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0; - res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0; - res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0; - res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0; - res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0; - res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0; - res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0; + __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0; + __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0; + __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0; + __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0; + __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0; + __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0; + __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0; + __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -740,28 +742,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats(__m1); - b = (__vector signed char)vec_splats(__m2); - c = (__vector signed char)vec_cmpgt(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed char)vec_splats(__m1); + __b = (__vector signed char)vec_splats(__m2); + __c = (__vector signed char)vec_cmpgt(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0; - res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0; - res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0; - res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0; - res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0; - res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0; - res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0; - res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0; + __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? 
-1 : 0; + __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0; + __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0; + __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0; + __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0; + __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0; + __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0; + __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -777,24 +779,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = (__vector signed short)vec_cmpeq(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = (__vector signed short)vec_cmpeq(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0; - res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0; - res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0; - res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0; + __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0; + __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0; + __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0; + __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -808,24 +810,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = (__vector signed short)vec_cmpgt(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = (__vector signed short)vec_cmpgt(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0; - res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0; - res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0; - res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0; + __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0; + __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0; + __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0; + __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? 
-1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -841,22 +843,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats(__m1); - b = (__vector signed int)vec_splats(__m2); - c = (__vector signed int)vec_cmpeq(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed int)vec_splats(__m1); + __b = (__vector signed int)vec_splats(__m2); + __c = (__vector signed int)vec_cmpeq(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0; - res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0; + __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0; + __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -870,22 +872,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats(__m1); - b = (__vector signed int)vec_splats(__m2); - c = (__vector signed int)vec_cmpgt(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed int)vec_splats(__m1); + __b = (__vector signed int)vec_splats(__m2); + __c = (__vector signed int)vec_cmpgt(__a, __b); + return (__m64)((__vector long long)__c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0; - res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0; + __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0; + __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? 
-1 : 0; - return (__m64)res.as_m64; + return (__m64)__res.as_m64; #endif } @@ -901,12 +903,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi8(__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats(__m1); - b = (__vector signed char)vec_splats(__m2); - c = vec_adds(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed char)vec_splats(__m1); + __b = (__vector signed char)vec_splats(__m2); + __c = vec_adds(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -919,12 +921,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi16(__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = vec_adds(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = vec_adds(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -937,12 +939,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu8(__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats(__m1); - b = (__vector unsigned char)vec_splats(__m2); - c = vec_adds(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector unsigned char)vec_splats(__m1); + __b = (__vector unsigned char)vec_splats(__m2); + __c = vec_adds(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -956,12 +958,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu16(__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats(__m1); - b = (__vector unsigned short)vec_splats(__m2); - c = vec_adds(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector unsigned short)vec_splats(__m1); + __b = (__vector unsigned short)vec_splats(__m2); + __c = vec_adds(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -975,12 +977,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi8(__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats(__m1); - b = (__vector signed char)vec_splats(__m2); - c = vec_subs(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed char)vec_splats(__m1); + __b = (__vector signed char)vec_splats(__m2); + __c = vec_subs(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -994,12 +996,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi16(__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = vec_subs(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector 
signed short)vec_splats(__m2); + __c = vec_subs(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1013,12 +1015,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu8(__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats(__m1); - b = (__vector unsigned char)vec_splats(__m2); - c = vec_subs(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector unsigned char)vec_splats(__m1); + __b = (__vector unsigned char)vec_splats(__m2); + __c = vec_subs(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1032,12 +1034,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu16(__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats(__m1); - b = (__vector unsigned short)vec_splats(__m2); - c = vec_subs(a, b); - return (__m64)((__vector long long)c)[0]; + __a = (__vector unsigned short)vec_splats(__m1); + __b = (__vector unsigned short)vec_splats(__m2); + __c = vec_subs(__a, __b); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1052,14 +1054,14 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_pi16(__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed int c; - __vector signed int zero = {0, 0, 0, 0}; + __vector signed short __a, __b; + __vector signed int __c; + __vector signed int __zero = {0, 0, 0, 0}; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); - c = vec_vmsumshm(a, b, zero); - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = vec_vmsumshm(__a, __b, __zero); + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1072,10 +1074,10 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed short c; - __vector signed int w0, w1; - __vector unsigned char xform1 = { + __vector signed short __a, __b; + __vector signed short __c; + __vector signed int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1085,14 +1087,14 @@ extern __inline __m64 #endif }; - a = (__vector signed short)vec_splats(__m1); - b = (__vector signed short)vec_splats(__m2); + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); - w0 = vec_vmulesh(a, b); - w1 = vec_vmulosh(a, b); - c = (__vector signed short)vec_perm(w0, w1, xform1); + __w0 = vec_vmulesh(__a, __b); + __w1 = vec_vmulosh(__a, __b); + __c = (__vector signed short)vec_perm(__w0, __w1, __xform1); - return (__m64)((__vector long long)c)[0]; + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1106,12 +1108,12 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_pi16(__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats(__m1); 
- b = (__vector signed short)vec_splats(__m2); - c = a * b; - return (__m64)((__vector long long)c)[0]; + __a = (__vector signed short)vec_splats(__m1); + __b = (__vector signed short)vec_splats(__m2); + __c = __a * __b; + return (__m64)((__vector long long)__c)[0]; } extern __inline __m64 @@ -1124,14 +1126,14 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi16(__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats(__m); - c = (__vector unsigned short)vec_splats((unsigned short)__count); - r = vec_sl(m, (__vector unsigned short)c); - return (__m64)((__vector long long)r)[0]; + __r = (__vector signed short)vec_splats(__m); + __c = (__vector unsigned short)vec_splats((unsigned short)__count); + __r = vec_sl(__r, (__vector unsigned short)__c); + return (__m64)((__vector long long)__r)[0]; } else return (0); } @@ -1159,13 +1161,13 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi32(__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] << __count; - res.as_int[1] = m.as_int[1] << __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] << __count; + __res.as_int[1] = __res.as_int[1] << __count; + return (__res.as_m64); } extern __inline __m64 @@ -1191,14 +1193,14 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi16(__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats(__m); - c = (__vector unsigned short)vec_splats((unsigned short)__count); - r = vec_sra(m, (__vector unsigned short)c); - return (__m64)((__vector long long)r)[0]; + __r = (__vector signed short)vec_splats(__m); + __c = (__vector unsigned short)vec_splats((unsigned short)__count); + __r = vec_sra(__r, (__vector unsigned short)__c); + return (__m64)((__vector long long)__r)[0]; } else return (0); } @@ -1226,13 +1228,13 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi32(__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] >> __count; - res.as_int[1] = m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] >> __count; + __res.as_int[1] = __res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 @@ -1258,14 +1260,14 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi16(__m64 __m, __m64 __count) { - __vector unsigned short m, r; - __vector unsigned short c; + __vector unsigned short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector unsigned short)vec_splats(__m); - c = (__vector unsigned short)vec_splats((unsigned short)__count); - r = vec_sr(m, (__vector unsigned short)c); - return (__m64)((__vector long long)r)[0]; + __r = (__vector unsigned short)vec_splats(__m); + __c = (__vector unsigned short)vec_splats((unsigned short)__count); + __r = vec_sr(__r, (__vector unsigned short)__c); + return (__m64)((__vector long 
long)__r)[0]; } else return (0); } @@ -1293,13 +1295,13 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi32(__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = (unsigned int)m.as_int[0] >> __count; - res.as_int[1] = (unsigned int)m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count; + __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 @@ -1326,24 +1328,24 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi32(int __i1, int __i0) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } /* Creates a vector of four 16-bit values; W0 is least significant. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { - __m64_union res; + __m64_union __res; - res.as_short[0] = __w0; - res.as_short[1] = __w1; - res.as_short[2] = __w2; - res.as_short[3] = __w3; - return (res.as_m64); + __res.as_short[0] = __w0; + __res.as_short[1] = __w1; + __res.as_short[2] = __w2; + __res.as_short[3] = __w3; + return (__res.as_m64); } /* Creates a vector of eight 8-bit values; B0 is least significant. */ @@ -1351,28 +1353,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - __m64_union res; + __m64_union __res; - res.as_char[0] = __b0; - res.as_char[1] = __b1; - res.as_char[2] = __b2; - res.as_char[3] = __b3; - res.as_char[4] = __b4; - res.as_char[5] = __b5; - res.as_char[6] = __b6; - res.as_char[7] = __b7; - return (res.as_m64); + __res.as_char[0] = __b0; + __res.as_char[1] = __b1; + __res.as_char[2] = __b2; + __res.as_char[3] = __b3; + __res.as_char[4] = __b4; + __res.as_char[5] = __b5; + __res.as_char[6] = __b6; + __res.as_char[7] = __b7; + return (__res.as_m64); } /* Similar, but with the arguments in reverse order. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_pi32(int __i0, int __i1) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } extern __inline __m64 @@ -1392,11 +1394,11 @@ extern __inline __m64 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_pi32(int __i) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i; - res.as_int[1] = __i; - return (res.as_m64); + __res.as_int[0] = __i; + __res.as_int[1] = __i; + return (__res.as_m64); } /* Creates a vector of four 16-bit values, all elements containing W. 
*/ @@ -1409,13 +1411,13 @@ extern __inline __m64 w = (__vector signed short)vec_splats(__w); return (__m64)((__vector long long)w)[0]; #else - __m64_union res; + __m64_union __res; - res.as_short[0] = __w; - res.as_short[1] = __w; - res.as_short[2] = __w; - res.as_short[3] = __w; - return (res.as_m64); + __res.as_short[0] = __w; + __res.as_short[1] = __w; + __res.as_short[2] = __w; + __res.as_short[3] = __w; + return (__res.as_m64); #endif } @@ -1424,27 +1426,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_pi8(signed char __b) { #if _ARCH_PWR8 - __vector signed char b; + __vector signed char __res; - b = (__vector signed char)vec_splats(__b); - return (__m64)((__vector long long)b)[0]; + __res = (__vector signed char)vec_splats(__b); + return (__m64)((__vector long long)__res)[0]; #else - __m64_union res; - - res.as_char[0] = __b; - res.as_char[1] = __b; - res.as_char[2] = __b; - res.as_char[3] = __b; - res.as_char[4] = __b; - res.as_char[5] = __b; - res.as_char[6] = __b; - res.as_char[7] = __b; - return (res.as_m64); + __m64_union __res; + + __res.as_char[0] = __b; + __res.as_char[1] = __b; + __res.as_char[2] = __b; + __res.as_char[3] = __b; + __res.as_char[4] = __b; + __res.as_char[5] = __b; + __res.as_char[6] = __b; + __res.as_char[7] = __b; + return (__res.as_m64); #endif } #else #include_next <mmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ #endif /* _MMINTRIN_H_INCLUDED */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/nmmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/nmmintrin.h new file mode 100644 index 000000000000..789bba6bc0d3 --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/nmmintrin.h @@ -0,0 +1,26 @@ +/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef NMMINTRIN_H_ +#define NMMINTRIN_H_ + +/* We just include SSE4.1 header file. */ +#include <smmintrin.h> + +#endif /* NMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/pmmintrin.h index 6d93383d5412..db128192abfb 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/pmmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/pmmintrin.h @@ -32,119 +32,114 @@ In the specific case of the monitor and mwait instructions there are no direct equivalent in the PowerISA at this time. So those intrinsics are not implemented. */ -#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning." +#error \ + "Please read comment above. 
Use -DNO_WARN_X86_INTRINSICS to disable this warning." #endif #ifndef PMMINTRIN_H_ #define PMMINTRIN_H_ -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) /* We need definitions from the SSE2 and SSE header files*/ #include <emmintrin.h> -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_addsub_ps (__m128 __X, __m128 __Y) -{ - const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; - __v4sf even_neg_Y = vec_xor(__Y, even_n0); - return (__m128) vec_add (__X, even_neg_Y); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_addsub_ps(__m128 __X, __m128 __Y) { + const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0}; + __v4sf __even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128)vec_add(__X, __even_neg_Y); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_addsub_pd (__m128d __X, __m128d __Y) -{ - const __v2df even_n0 = {-0.0, 0.0}; - __v2df even_neg_Y = vec_xor(__Y, even_n0); - return (__m128d) vec_add (__X, even_neg_Y); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_addsub_pd(__m128d __X, __m128d __Y) { + const __v2df __even_n0 = {-0.0, 0.0}; + __v2df __even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128d)vec_add(__X, __even_neg_Y); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_ps (__m128 __X, __m128 __Y) -{ - __vector unsigned char xform2 = { - 0x00, 0x01, 0x02, 0x03, - 0x08, 0x09, 0x0A, 0x0B, - 0x10, 0x11, 0x12, 0x13, - 0x18, 0x19, 0x1A, 0x1B - }; - __vector unsigned char xform1 = { - 0x04, 0x05, 0x06, 0x07, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x14, 0x15, 0x16, 0x17, - 0x1C, 0x1D, 0x1E, 0x1F - }; - return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_ps(__m128 __X, __m128 __Y) { + __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, + 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B}; + __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, + 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F}; + return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2), + vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsub_ps (__m128 __X, __m128 __Y) -{ - __vector unsigned char xform2 = { - 0x00, 0x01, 0x02, 0x03, - 0x08, 0x09, 0x0A, 0x0B, - 0x10, 0x11, 0x12, 0x13, - 0x18, 0x19, 0x1A, 0x1B - }; - __vector unsigned char xform1 = { - 0x04, 0x05, 0x06, 0x07, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x14, 0x15, 0x16, 0x17, - 0x1C, 0x1D, 0x1E, 0x1F - }; - return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_ps(__m128 __X, __m128 __Y) { + __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, + 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B}; + __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, + 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F}; + return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2), + vec_perm((__v4sf)__X, 
(__v4sf)__Y, __xform1)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_pd (__m128d __X, __m128d __Y) -{ - return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y), - vec_mergel ((__v2df) __X, (__v2df)__Y)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_pd(__m128d __X, __m128d __Y) { + return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y), + vec_mergel((__v2df)__X, (__v2df)__Y)); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsub_pd (__m128d __X, __m128d __Y) -{ - return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y), - vec_mergel ((__v2df) __X, (__v2df)__Y)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_pd(__m128d __X, __m128d __Y) { + return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y), + vec_mergel((__v2df)__X, (__v2df)__Y)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movehdup_ps (__m128 __X) -{ - return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); +#ifdef _ARCH_PWR8 +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movehdup_ps(__m128 __X) { + return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X); } +#endif -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_moveldup_ps (__m128 __X) -{ - return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); +#ifdef _ARCH_PWR8 +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_moveldup_ps(__m128 __X) { + return (__m128)vec_mergee((__v4su)__X, (__v4su)__X); } +#endif -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loaddup_pd (double const *__P) -{ - return (__m128d) vec_splats (*__P); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loaddup_pd(double const *__P) { + return (__m128d)vec_splats(*__P); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movedup_pd (__m128d __X) -{ - return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movedup_pd(__m128d __X) { + return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0)); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_lddqu_si128 (__m128i const *__P) -{ - return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_lddqu_si128(__m128i const *__P) { + return (__m128i)(vec_vsx_ld(0, (signed int const *)__P)); } /* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. 
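For reference, a minimal sketch of the horizontal-add mapping reformatted above (vec_perm of the even/odd lanes followed by vec_add). It is illustrative only, not part of the diff, and assumes the companion xmmintrin.h wrapper supplies _mm_set_ps/_mm_storeu_ps as usual, with NO_WARN_X86_INTRINSICS defined on a VSX-enabled powerpc64le target. Note that _mm_movehdup_ps/_mm_moveldup_ps above are now guarded by _ARCH_PWR8, since vec_mergeo/vec_mergee require POWER8.

/* Sketch only: _mm_hadd_ps adds adjacent lane pairs of each operand. */
#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1 2 3 4 */
  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* lanes 0..3 = 5 6 7 8 */
  float out[4];
  _mm_storeu_ps(out, _mm_hadd_ps(a, b));
  /* Expected, matching the x86 semantics: 3 7 11 15 */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}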
*/ #else #include_next <pmmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ #endif /* PMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/smmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/smmintrin.h index 64f0c761994d..19cdecb18d2b 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/smmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -14,7 +14,7 @@ #ifndef NO_WARN_X86_INTRINSICS /* This header is distributed to simplify porting x86_64 code that - makes explicit use of Intel intrinsics to powerp64/powerpc64le. + makes explicit use of Intel intrinsics to powerpc64/powerpc64le. It is the user's responsibility to determine if the results are acceptable and make additional changes as necessary. @@ -29,10 +29,273 @@ #ifndef SMMINTRIN_H_ #define SMMINTRIN_H_ -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) #include <altivec.h> -#include <emmintrin.h> +#include <tmmintrin.h> + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_ZERO 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_NEG_INF 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_pd(__m128d __A, int __rounding) { + __v2df __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : "+wa"(__A)); + } + + switch (__rounding) { + case _MM_FROUND_TO_NEAREST_INT: +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif + __attribute__((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_ppc_set_fpscr_rn(0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. 
+ */ + __asm__("" : "+wa"(__A)); + + __r = vec_rint((__v2df)__A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : : "wa"(__r)); + __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor((__v2df)__A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil((__v2df)__A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc((__v2df)__A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint((__v2df)__A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : : "wa"(__r)); + /* Restore enabled exceptions. */ +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); + } + return (__m128d)__r; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_sd(__m128d __A, __m128d __B, int __rounding) { + __B = _mm_round_pd(__B, __rounding); + __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]}; + return (__m128d)__r; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_ps(__m128 __A, int __rounding) { + __v4sf __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : "+wa"(__A)); + } + + switch (__rounding) { + case _MM_FROUND_TO_NEAREST_INT: +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif + __attribute__((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_ppc_set_fpscr_rn(0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. 
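For reference, a minimal usage sketch of the new _MM_FROUND_* controls and _mm_round_pd defined above, which steer the FPSCR rounding mode (and, with _MM_FROUND_NO_EXC, temporarily mask the exception enables). Illustrative only, not part of the diff; it assumes the companion emmintrin.h wrapper supplies _mm_set_pd/_mm_storeu_pd as usual and that NO_WARN_X86_INTRINSICS is defined on a VSX-enabled powerpc64le target.

/* Sketch only: explicit rounding modes via the macros defined above. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(-1.25, 2.5); /* lane 0 = 2.5, lane 1 = -1.25 */
  double f[2], n[2];
  _mm_storeu_pd(f, _mm_round_pd(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
  _mm_storeu_pd(n, _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  printf("floor:   %g %g\n", f[0], f[1]); /* expected: 2 -2 */
  printf("nearest: %g %g\n", n[0], n[1]); /* expected: 2 -1 (2.5 rounds to even) */
  return 0;
}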
+ */ + __asm__("" : "+wa"(__A)); + + __r = vec_rint((__v4sf)__A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : : "wa"(__r)); + __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor((__v4sf)__A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil((__v4sf)__A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc((__v4sf)__A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint((__v4sf)__A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__("" : : "wa"(__r)); + /* Restore enabled exceptions. */ +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); + } + return (__m128)__r; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_ss(__m128 __A, __m128 __B, int __rounding) { + __B = _mm_round_ps(__B, __rounding); + __v4sf __r = (__v4sf)__A; + __r[0] = ((__v4sf)__B)[0]; + return (__m128)__r; +} + +#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR) + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { + __v16qi __result = (__v16qi)__A; + + __result[__N & 0xf] = __D; + + return (__m128i)__result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { + __v4si __result = (__v4si)__A; + + __result[__N & 3] = __D; + + return (__m128i)__result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { + __v2di __result = (__v2di)__A; + + __result[__N & 1] = __D; + + return (__m128i)__result; +} extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -58,52 +321,363 @@ extern __inline int return ((__v4si)__X)[__N & 3]; } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { - __v16qi __charmask = vec_splats((signed char)__imm8); + __v16qu __charmask = vec_splats((unsigned char)__imm8); __charmask = 
vec_gb(__charmask); - __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); + __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask); #ifdef __BIG_ENDIAN__ __shortmask = vec_reve(__shortmask); #endif return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { +#ifdef _ARCH_PWR10 + return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask); +#else const __v16qu __seven = vec_splats((unsigned char)0x07); __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); - return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); + return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask); +#endif +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) { + __v16qu __pcv[] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15}, + {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15}, + {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31}, + {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31}, + {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}, + {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}, + {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}, + {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}, + {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, + {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, + }; + __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]); + return (__m128)__r; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) { +#ifdef _ARCH_PWR10 + return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask); +#else + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero); + return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask); +#endif +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) { + __v16qu __pcv[] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}, + {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}}; + __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d)__r; } +#ifdef _ARCH_PWR8 +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) { +#ifdef _ARCH_PWR10 + return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask); +#else + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = + vec_cmplt((__v2di)__mask, __zero); + return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask); +#endif +} +#endif + +extern __inline int + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testz_si128(__m128i __A, __m128i __B) { + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testc_si128(__m128i __A, __m128i __B) { + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A); + return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testnzc_si128(__m128i __A, __m128i __B) { + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0; +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) + +#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) + +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; - result[__N & 0xf] = __D; - return (__m128i)result; + _mm_cmpeq_epi64(__m128i __X, __m128i __Y) { + return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y); } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; - result[__N & 3] = __D; - return (__m128i)result; + _mm_min_epi8(__m128i __X, __m128i __Y) { + return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; - result[__N & 1] = __D; - return (__m128i)result; + _mm_min_epu16(__m128i __X, __m128i __Y) { + return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epi32(__m128i __X, __m128i __Y) { + return (__m128i)vec_min((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epu32(__m128i __X, __m128i __Y) { + return (__m128i)vec_min((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi8(__m128i __X, __m128i __Y) { + return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu16(__m128i __X, __m128i __Y) { + return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi32(__m128i __X, __m128i __Y) { + return (__m128i)vec_max((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu32(__m128i __X, __m128i __Y) { + return (__m128i)vec_max((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_epi32(__m128i __X, __m128i __Y) { + 
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_epi32(__m128i __X, __m128i __Y) { + return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y); +} +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi16(__m128i __A) { + return (__m128i)vec_unpackh((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi32(__m128i __A) { + __A = (__m128i)vec_unpackh((__v16qi)__A); + return (__m128i)vec_unpackh((__v8hi)__A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi64(__m128i __A) { + __A = (__m128i)vec_unpackh((__v16qi)__A); + __A = (__m128i)vec_unpackh((__v8hi)__A); + return (__m128i)vec_unpackh((__v4si)__A); +} +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi16_epi32(__m128i __A) { + return (__m128i)vec_unpackh((__v8hi)__A); } +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi16_epi64(__m128i __A) { + __A = (__m128i)vec_unpackh((__v8hi)__A); + return (__m128i)vec_unpackh((__v4si)__A); +} +#endif + +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_epi64(__m128i __A) { + return (__m128i)vec_unpackh((__v4si)__A); +} +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi16(__m128i __A) { + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v16qu)__A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi32(__m128i __A) { + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v16qu)__A, __zero); + __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A); + __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi64(__m128i __A) { + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v16qu)__A, __zero); + __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero); + __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A); + __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A); + __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu16_epi32(__m128i __A) { + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v8hu)__A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A); +#endif /* __BIG_ENDIAN__. 
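For reference, a minimal sketch of the zero-extending conversions added above, which merge the low lanes with a zero vector (merge order flipped for big endian). Illustrative only, not part of the diff; it assumes the companion emmintrin.h wrapper supplies _mm_setr_epi8/_mm_storeu_si128 as usual and that NO_WARN_X86_INTRINSICS is defined on a VSX-enabled powerpc64le target.

/* Sketch only: _mm_cvtepu8_epi32 zero-extends the low four bytes. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  /* The first byte is 0xFA; zero-extension must give 250, not -6. */
  __m128i b = _mm_setr_epi8(250, 1, 2, 3, 4, 5, 6, 7,
                            8, 9, 10, 11, 12, 13, 14, 15);
  int out[4];
  _mm_storeu_si128((__m128i *)out, _mm_cvtepu8_epi32(b));
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* expected: 250 1 2 3 */
  return 0;
}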
*/ + return __A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu16_epi64(__m128i __A) { + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v8hu)__A, __zero); + __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A); + __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu32_epi64(__m128i __A) { + const __v4su __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i)vec_mergeh((__v4su)__A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i)vec_mergeh(__zero, (__v4su)__A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_minpos_epu16(__m128i __A) { + union __u { + __m128i __m; + __v8hu __uh; + }; + union __u __u = {.__m = __A}, __r = {.__m = {0}}; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + unsigned long __i; + for (__i = 1; __i < 8; __i++) { + if (__u.__uh[__i] < __rmin) { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packus_epi32(__m128i __X, __m128i __Y) { + return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi64(__m128i __X, __m128i __Y) { + return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y); +} +#endif + #else #include_next <smmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ -#endif /* _SMMINTRIN_H_ */ +#endif /* SMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/tmmintrin.h index b5a935d5e47e..92f08676d2df 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/tmmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/tmmintrin.h @@ -25,7 +25,8 @@ #ifndef TMMINTRIN_H_ #define TMMINTRIN_H_ -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) #include <altivec.h> @@ -33,463 +34,420 @@ #include <pmmintrin.h> extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_epi16 (__m128i __A) -{ - return (__m128i) vec_abs ((__v8hi) __A); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi16(__m128i __A) { + return (__m128i)vec_abs((__v8hi)__A); } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_epi32 (__m128i __A) -{ - return (__m128i) vec_abs ((__v4si) __A); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi32(__m128i __A) { + return (__m128i)vec_abs((__v4si)__A); } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_epi8 (__m128i __A) -{ - return (__m128i) vec_abs ((__v16qi) __A); + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi8(__m128i __A) { + return (__m128i)vec_abs((__v16qi)__A); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_pi16 (__m64 __A) -{ - __v8hi __B = (__v8hi) (__v2du) { __A, __A }; - return (__m64) ((__v2du) vec_abs (__B))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi16(__m64 __A) { + __v8hi __B = (__v8hi)(__v2du){__A, __A}; + return (__m64)((__v2du)vec_abs(__B))[0]; } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_pi32 (__m64 __A) -{ - __v4si __B = (__v4si) (__v2du) { __A, __A }; - return (__m64) ((__v2du) vec_abs (__B))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi32(__m64 __A) { + __v4si __B = (__v4si)(__v2du){__A, __A}; + return (__m64)((__v2du)vec_abs(__B))[0]; } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_pi8 (__m64 __A) -{ - __v16qi __B = (__v16qi) (__v2du) { __A, __A }; - return (__m64) ((__v2du) vec_abs (__B))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi8(__m64 __A) { + __v16qi __B = (__v16qi)(__v2du){__A, __A}; + return (__m64)((__v2du)vec_abs(__B))[0]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count) -{ - if (__builtin_constant_p (__count) && __count < 16) - { + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) { + if (__builtin_constant_p(__count) && __count < 16) { #ifdef __LITTLE_ENDIAN__ - __A = (__m128i) vec_reve ((__v16qu) __A); - __B = (__m128i) vec_reve ((__v16qu) __B); + __A = (__m128i)vec_reve((__v16qu)__A); + __B = (__m128i)vec_reve((__v16qu)__B); #endif - __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count); + __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count); #ifdef __LITTLE_ENDIAN__ - __A = (__m128i) vec_reve ((__v16qu) __A); + __A = (__m128i)vec_reve((__v16qu)__A); #endif - return __A; - } + return __A; + } if (__count == 0) return __B; - if (__count >= 16) - { - if (__count >= 32) - { - const __v16qu zero = { 0 }; - return (__m128i) zero; - } - else - { - const __v16qu __shift = - vec_splats ((unsigned char) ((__count - 16) * 8)); + if (__count >= 16) { + if (__count >= 32) { + const __v16qu __zero = {0}; + return (__m128i)__zero; + } else { + const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8)); #ifdef __LITTLE_ENDIAN__ - return (__m128i) vec_sro ((__v16qu) __A, __shift); + return (__m128i)vec_sro((__v16qu)__A, __shift); #else - return (__m128i) vec_slo ((__v16qu) __A, __shift); + return (__m128i)vec_slo((__v16qu)__A, __shift); #endif - } } - else - { - const __v16qu __shiftA = - vec_splats ((unsigned char) ((16 - __count) * 8)); - const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8)); + } else { + const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8)); + const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8)); #ifdef __LITTLE_ENDIAN__ - __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA); - __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB); + __A = (__m128i)vec_slo((__v16qu)__A, __shiftA); + __B = (__m128i)vec_sro((__v16qu)__B, __shiftB); #else - __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA); - __B = (__m128i) vec_slo ((__v16qu) 
__B, __shiftB); + __A = (__m128i)vec_sro((__v16qu)__A, __shiftA); + __B = (__m128i)vec_slo((__v16qu)__B, __shiftB); #endif - return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B); - } + return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B); + } } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count) -{ - if (__count < 16) - { - __v2du __C = { __B, __A }; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) { + if (__count < 16) { + __v2du __C = {__B, __A}; #ifdef __LITTLE_ENDIAN__ - const __v4su __shift = { __count << 3, 0, 0, 0 }; - __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift); + const __v4su __shift = {__count << 3, 0, 0, 0}; + __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift); #else - const __v4su __shift = { 0, 0, 0, __count << 3 }; - __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift); + const __v4su __shift = {0, 0, 0, __count << 3}; + __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift); #endif - return (__m64) __C[0]; - } - else - { - const __m64 __zero = { 0 }; - return __zero; - } + return (__m64)__C[0]; + } else { + const __m64 __zero = {0}; + return __zero; + } } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_epi16 (__m128i __A, __m128i __B) -{ - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; - __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); - __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); - return (__m128i) vec_add (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_epi16(__m128i __A, __m128i __B) { + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P); + __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q); + return (__m128i)vec_add(__C, __D); } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_epi32 (__m128i __A, __m128i __B) -{ - const __v16qu __P = - { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; - const __v16qu __Q = - { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; - __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); - __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); - return (__m128i) vec_add (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_epi32(__m128i __A, __m128i __B) { + const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, + 16, 17, 18, 19, 24, 25, 26, 27}; + const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, + 20, 21, 22, 23, 28, 29, 30, 31}; + __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P); + __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q); + return (__m128i)vec_add(__C, __D); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_pi16 (__m64 __A, __m64 __B) -{ - __v8hi __C = (__v8hi) (__v2du) { __A, __B }; - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; - __v8hi __D = vec_perm (__C, __C, __Q); - __C = vec_perm (__C, __C, __P); - __C = vec_add (__C, __D); - return 
(__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_pi16(__m64 __A, __m64 __B) { + __v8hi __C = (__v8hi)(__v2du){__A, __B}; + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15}; + __v8hi __D = vec_perm(__C, __C, __Q); + __C = vec_perm(__C, __C, __P); + __C = vec_add(__C, __D); + return (__m64)((__v2du)__C)[1]; } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadd_pi32 (__m64 __A, __m64 __B) -{ - __v4si __C = (__v4si) (__v2du) { __A, __B }; - const __v16qu __P = - { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; - const __v16qu __Q = - { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; - __v4si __D = vec_perm (__C, __C, __Q); - __C = vec_perm (__C, __C, __P); - __C = vec_add (__C, __D); - return (__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_pi32(__m64 __A, __m64 __B) { + __v4si __C = (__v4si)(__v2du){__A, __B}; + const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11}; + const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15}; + __v4si __D = vec_perm(__C, __C, __Q); + __C = vec_perm(__C, __C, __P); + __C = vec_add(__C, __D); + return (__m64)((__v2du)__C)[1]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadds_epi16 (__m128i __A, __m128i __B) -{ - __v4si __C = { 0 }, __D = { 0 }; - __C = vec_sum4s ((__v8hi) __A, __C); - __D = vec_sum4s ((__v8hi) __B, __D); - __C = (__v4si) vec_packs (__C, __D); - return (__m128i) __C; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadds_epi16(__m128i __A, __m128i __B) { + __v4si __C = {0}, __D = {0}; + __C = vec_sum4s((__v8hi)__A, __C); + __D = vec_sum4s((__v8hi)__B, __D); + __C = (__v4si)vec_packs(__C, __D); + return (__m128i)__C; } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hadds_pi16 (__m64 __A, __m64 __B) -{ - const __v4si __zero = { 0 }; - __v8hi __C = (__v8hi) (__v2du) { __A, __B }; - __v4si __D = vec_sum4s (__C, __zero); - __C = vec_packs (__D, __D); - return (__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadds_pi16(__m64 __A, __m64 __B) { + const __v4si __zero = {0}; + __v8hi __C = (__v8hi)(__v2du){__A, __B}; + __v4si __D = vec_sum4s(__C, __zero); + __C = vec_packs(__D, __D); + return (__m64)((__v2du)__C)[1]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsub_epi16 (__m128i __A, __m128i __B) -{ - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; - __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); - __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); - return (__m128i) vec_sub (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_epi16(__m128i __A, __m128i __B) { + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P); + __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q); + return (__m128i)vec_sub(__C, __D); } extern __inline __m128i -__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) -_mm_hsub_epi32 (__m128i __A, __m128i __B) -{ - const __v16qu __P = - { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; - const __v16qu __Q = - { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; - __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); - __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); - return (__m128i) vec_sub (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_epi32(__m128i __A, __m128i __B) { + const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, + 16, 17, 18, 19, 24, 25, 26, 27}; + const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, + 20, 21, 22, 23, 28, 29, 30, 31}; + __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P); + __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q); + return (__m128i)vec_sub(__C, __D); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsub_pi16 (__m64 __A, __m64 __B) -{ - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; - __v8hi __C = (__v8hi) (__v2du) { __A, __B }; - __v8hi __D = vec_perm (__C, __C, __Q); - __C = vec_perm (__C, __C, __P); - __C = vec_sub (__C, __D); - return (__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_pi16(__m64 __A, __m64 __B) { + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15}; + __v8hi __C = (__v8hi)(__v2du){__A, __B}; + __v8hi __D = vec_perm(__C, __C, __Q); + __C = vec_perm(__C, __C, __P); + __C = vec_sub(__C, __D); + return (__m64)((__v2du)__C)[1]; } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsub_pi32 (__m64 __A, __m64 __B) -{ - const __v16qu __P = - { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; - const __v16qu __Q = - { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; - __v4si __C = (__v4si) (__v2du) { __A, __B }; - __v4si __D = vec_perm (__C, __C, __Q); - __C = vec_perm (__C, __C, __P); - __C = vec_sub (__C, __D); - return (__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_pi32(__m64 __A, __m64 __B) { + const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11}; + const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15}; + __v4si __C = (__v4si)(__v2du){__A, __B}; + __v4si __D = vec_perm(__C, __C, __Q); + __C = vec_perm(__C, __C, __P); + __C = vec_sub(__C, __D); + return (__m64)((__v2du)__C)[1]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsubs_epi16 (__m128i __A, __m128i __B) -{ - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; - __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); - __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); - return (__m128i) vec_subs (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubs_epi16(__m128i __A, __m128i __B) { + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P); + __v8hi __D = vec_perm((__v8hi)__A, 
(__v8hi)__B, __Q); + return (__m128i)vec_subs(__C, __D); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_hsubs_pi16 (__m64 __A, __m64 __B) -{ - const __v16qu __P = - { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; - const __v16qu __Q = - { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; - __v8hi __C = (__v8hi) (__v2du) { __A, __B }; - __v8hi __D = vec_perm (__C, __C, __P); - __v8hi __E = vec_perm (__C, __C, __Q); - __C = vec_subs (__D, __E); - return (__m64) ((__v2du) __C)[1]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubs_pi16(__m64 __A, __m64 __B) { + const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; + const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15}; + __v8hi __C = (__v8hi)(__v2du){__A, __B}; + __v8hi __D = vec_perm(__C, __C, __P); + __v8hi __E = vec_perm(__C, __C, __Q); + __C = vec_subs(__D, __E); + return (__m64)((__v2du)__C)[1]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_epi8 (__m128i __A, __m128i __B) -{ - const __v16qi __zero = { 0 }; - __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero); - __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B); - return (__m128i) vec_sel (__C, __zero, __select); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_epi8(__m128i __A, __m128i __B) { + const __v16qi __zero = {0}; + __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero); + __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B); + return (__m128i)vec_sel(__C, __zero, __select); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_pi8 (__m64 __A, __m64 __B) -{ - const __v16qi __zero = { 0 }; - __v16qi __C = (__v16qi) (__v2du) { __A, __A }; - __v16qi __D = (__v16qi) (__v2du) { __B, __B }; - __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero); - __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D); - __C = vec_sel (__C, __zero, __select); - return (__m64) ((__v2du) (__C))[0]; -} - + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pi8(__m64 __A, __m64 __B) { + const __v16qi __zero = {0}; + __v16qi __C = (__v16qi)(__v2du){__A, __A}; + __v16qi __D = (__v16qi)(__v2du){__B, __B}; + __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero); + __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D); + __C = vec_sel(__C, __zero, __select); + return (__m64)((__v2du)(__C))[0]; +} + +#ifdef _ARCH_PWR8 extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sign_epi8 (__m128i __A, __m128i __B) -{ - const __v16qi __zero = { 0 }; - __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi8(__m128i __A, __m128i __B) { + const __v16qi __zero = {0}; + __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero); __v16qi __selectpos = - (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero)); - __v16qi __conv = vec_add (__selectneg, __selectpos); - return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); + (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero)); + __v16qi __conv = vec_add(__selectneg, __selectpos); + return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i -__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) -_mm_sign_epi16 (__m128i __A, __m128i __B) -{ - const __v8hi __zero = { 0 }; - __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero); - __v8hi __selectpos = - (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero)); - __v8hi __conv = vec_add (__selectneg, __selectpos); - return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi16(__m128i __A, __m128i __B) { + const __v8hi __zero = {0}; + __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero); + __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero)); + __v8hi __conv = vec_add(__selectneg, __selectpos); + return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sign_epi32 (__m128i __A, __m128i __B) -{ - const __v4si __zero = { 0 }; - __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero); - __v4si __selectpos = - (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero)); - __v4si __conv = vec_add (__selectneg, __selectpos); - return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi32(__m128i __A, __m128i __B) { + const __v4si __zero = {0}; + __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero); + __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero)); + __v4si __conv = vec_add(__selectneg, __selectpos); + return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sign_pi8 (__m64 __A, __m64 __B) -{ - const __v16qi __zero = { 0 }; - __v16qi __C = (__v16qi) (__v2du) { __A, __A }; - __v16qi __D = (__v16qi) (__v2du) { __B, __B }; - __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); - return (__m64) ((__v2du) (__C))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_pi8(__m64 __A, __m64 __B) { + const __v16qi __zero = {0}; + __v16qi __C = (__v16qi)(__v2du){__A, __A}; + __v16qi __D = (__v16qi)(__v2du){__B, __B}; + __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D); + return (__m64)((__v2du)(__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sign_pi16 (__m64 __A, __m64 __B) -{ - const __v8hi __zero = { 0 }; - __v8hi __C = (__v8hi) (__v2du) { __A, __A }; - __v8hi __D = (__v8hi) (__v2du) { __B, __B }; - __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); - return (__m64) ((__v2du) (__C))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_pi16(__m64 __A, __m64 __B) { + const __v8hi __zero = {0}; + __v8hi __C = (__v8hi)(__v2du){__A, __A}; + __v8hi __D = (__v8hi)(__v2du){__B, __B}; + __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D); + return (__m64)((__v2du)(__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sign_pi32 (__m64 __A, __m64 __B) -{ - const __v4si __zero = { 0 }; - __v4si __C = (__v4si) (__v2du) { __A, __A }; - __v4si __D = (__v4si) (__v2du) { __B, __B }; - __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); - return (__m64) ((__v2du) (__C))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_pi32(__m64 __A, __m64 __B) { + 
const __v4si __zero = {0}; + __v4si __C = (__v4si)(__v2du){__A, __A}; + __v4si __D = (__v4si)(__v2du){__B, __B}; + __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D); + return (__m64)((__v2du)(__C))[0]; } +#endif extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maddubs_epi16 (__m128i __A, __m128i __B) -{ - __v8hi __unsigned = vec_splats ((signed short) 0x00ff); - __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned); - __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned); - __v8hi __E = vec_unpackh ((__v16qi) __B); - __v8hi __F = vec_unpackl ((__v16qi) __B); - __C = vec_mul (__C, __E); - __D = vec_mul (__D, __F); - const __v16qu __odds = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; - const __v16qu __evens = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; - __E = vec_perm (__C, __D, __odds); - __F = vec_perm (__C, __D, __evens); - return (__m128i) vec_adds (__E, __F); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddubs_epi16(__m128i __A, __m128i __B) { + __v8hi __unsigned = vec_splats((signed short)0x00ff); + __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned); + __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned); + __v8hi __E = vec_unpackh((__v16qi)__B); + __v8hi __F = vec_unpackl((__v16qi)__B); + __C = vec_mul(__C, __E); + __D = vec_mul(__D, __F); + const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + __E = vec_perm(__C, __D, __odds); + __F = vec_perm(__C, __D, __evens); + return (__m128i)vec_adds(__E, __F); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maddubs_pi16 (__m64 __A, __m64 __B) -{ - __v8hi __C = (__v8hi) (__v2du) { __A, __A }; - __C = vec_unpackl ((__v16qi) __C); - const __v8hi __unsigned = vec_splats ((signed short) 0x00ff); - __C = vec_and (__C, __unsigned); - __v8hi __D = (__v8hi) (__v2du) { __B, __B }; - __D = vec_unpackl ((__v16qi) __D); - __D = vec_mul (__C, __D); - const __v16qu __odds = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; - const __v16qu __evens = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; - __C = vec_perm (__D, __D, __odds); - __D = vec_perm (__D, __D, __evens); - __C = vec_adds (__C, __D); - return (__m64) ((__v2du) (__C))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddubs_pi16(__m64 __A, __m64 __B) { + __v8hi __C = (__v8hi)(__v2du){__A, __A}; + __C = vec_unpackl((__v16qi)__C); + const __v8hi __unsigned = vec_splats((signed short)0x00ff); + __C = vec_and(__C, __unsigned); + __v8hi __D = (__v8hi)(__v2du){__B, __B}; + __D = vec_unpackl((__v16qi)__D); + __D = vec_mul(__C, __D); + const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + __C = vec_perm(__D, __D, __odds); + __D = vec_perm(__D, __D, __evens); + __C = vec_adds(__C, __D); + return (__m64)((__v2du)(__C))[0]; } extern __inline __m128i -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mulhrs_epi16 (__m128i __A, __m128i __B) -{ - __v4si __C = vec_unpackh ((__v8hi) __A); - __v4si __D = vec_unpackh ((__v8hi) __B); - __C = vec_mul (__C, __D); - __D = vec_unpackl ((__v8hi) __A); - __v4si __E = vec_unpackl ((__v8hi) __B); - __D = vec_mul (__D, __E); - const __v4su 
__shift = vec_splats ((unsigned int) 14); - __C = vec_sr (__C, __shift); - __D = vec_sr (__D, __shift); - const __v4si __ones = vec_splats ((signed int) 1); - __C = vec_add (__C, __ones); - __C = vec_sr (__C, (__v4su) __ones); - __D = vec_add (__D, __ones); - __D = vec_sr (__D, (__v4su) __ones); - return (__m128i) vec_pack (__C, __D); + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhrs_epi16(__m128i __A, __m128i __B) { + __v4si __C = vec_unpackh((__v8hi)__A); + __v4si __D = vec_unpackh((__v8hi)__B); + __C = vec_mul(__C, __D); + __D = vec_unpackl((__v8hi)__A); + __v4si __E = vec_unpackl((__v8hi)__B); + __D = vec_mul(__D, __E); + const __v4su __shift = vec_splats((unsigned int)14); + __C = vec_sr(__C, __shift); + __D = vec_sr(__D, __shift); + const __v4si __ones = vec_splats((signed int)1); + __C = vec_add(__C, __ones); + __C = vec_sr(__C, (__v4su)__ones); + __D = vec_add(__D, __ones); + __D = vec_sr(__D, (__v4su)__ones); + return (__m128i)vec_pack(__C, __D); } extern __inline __m64 -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mulhrs_pi16 (__m64 __A, __m64 __B) -{ - __v4si __C = (__v4si) (__v2du) { __A, __A }; - __C = vec_unpackh ((__v8hi) __C); - __v4si __D = (__v4si) (__v2du) { __B, __B }; - __D = vec_unpackh ((__v8hi) __D); - __C = vec_mul (__C, __D); - const __v4su __shift = vec_splats ((unsigned int) 14); - __C = vec_sr (__C, __shift); - const __v4si __ones = vec_splats ((signed int) 1); - __C = vec_add (__C, __ones); - __C = vec_sr (__C, (__v4su) __ones); - __v8hi __E = vec_pack (__C, __D); - return (__m64) ((__v2du) (__E))[0]; + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhrs_pi16(__m64 __A, __m64 __B) { + __v4si __C = (__v4si)(__v2du){__A, __A}; + __C = vec_unpackh((__v8hi)__C); + __v4si __D = (__v4si)(__v2du){__B, __B}; + __D = vec_unpackh((__v8hi)__D); + __C = vec_mul(__C, __D); + const __v4su __shift = vec_splats((unsigned int)14); + __C = vec_sr(__C, __shift); + const __v4si __ones = vec_splats((signed int)1); + __C = vec_add(__C, __ones); + __C = vec_sr(__C, (__v4su)__ones); + __v8hi __E = vec_pack(__C, __D); + return (__m64)((__v2du)(__E))[0]; } #else #include_next <tmmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ #endif /* TMMINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86gprintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86gprintrin.h new file mode 100644 index 000000000000..cbfac262395c --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86gprintrin.h @@ -0,0 +1,17 @@ +/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef X86GPRINTRIN_H_ +#define X86GPRINTRIN_H_ + +#include <bmiintrin.h> + +#include <bmi2intrin.h> + +#endif /* X86GPRINTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86intrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86intrin.h new file mode 100644 index 000000000000..f5c201262e69 --- /dev/null +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/x86intrin.h @@ -0,0 +1,28 @@ +/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef X86INTRIN_H_ +#define X86INTRIN_H_ + +#ifdef __ALTIVEC__ +#include <immintrin.h> +#endif /* __ALTIVEC__ */ + +#endif /* X86INTRIN_H_ */ diff --git a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h index 0e45b96769f8..9dd21b65c2f7 100644 --- a/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h +++ b/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h @@ -28,25 +28,27 @@ Most SSE scalar float intrinsic operations can be performed more efficiently as C language float scalar operations or optimized to use vector SIMD operations. We recommend this for new applications. */ -#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#error \ + "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED +#ifndef XMMINTRIN_H_ +#define XMMINTRIN_H_ -#if defined(__linux__) && defined(__ppc64__) +#if defined(__powerpc64__) && \ + (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) /* Define four value permute mask */ -#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) +#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) #include <altivec.h> /* Avoid collisions between altivec.h and strict adherence to C++ and C11 standards. This should eventually be done inside altivec.h itself, but only after testing a full distro build. */ -#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \ - (defined(__STDC_VERSION__) && \ - __STDC_VERSION__ >= 201112L)) +#if defined(__STRICT_ANSI__) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)) #undef vector #undef pixel #undef bool @@ -71,145 +73,145 @@ typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); typedef vector float __v4sf; /* Create an undefined vector. 
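The #error text in x86intrin.h above spells out the intended workflow: x86 sources that use Intel intrinsics are compiled unchanged on powerpc64le, with -DNO_WARN_X86_INTRINSICS acknowledging the porting warning. A minimal sketch of such a translation unit follows; the target triple and -mcpu=power8 flag are assumptions about the build environment, not something this patch mandates.

/* Hedged sketch: unchanged SSE code built against the ppc_wrappers headers.
   Assumed build line (not part of this patch):
     clang -target powerpc64le-linux-gnu -mcpu=power8 \
           -DNO_WARN_X86_INTRINSICS demo.c -o demo */
#include <xmmintrin.h>   /* resolved to the wrapper shown in this diff */
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* elements {1,2,3,4} */
  __m128 b = _mm_set1_ps(0.5f);
  float out[4];
  _mm_storeu_ps(out, _mm_mul_ps(a, b));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0.5 1 1.5 2 */
  return 0;
}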
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_undefined_ps (void) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_ps(void) { __m128 __Y = __Y; return __Y; } /* Create a vector of zeros. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setzero_ps (void) -{ - return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_ps(void) { + return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; } /* Load four SPFP values from P. The address must be 16-byte aligned. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_ps (float const *__P) -{ - return ((__m128)vec_ld(0, (__v4sf*)__P)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps(float const *__P) { + return ((__m128)vec_ld(0, (__v4sf *)__P)); } /* Load four SPFP values from P. The address need not be 16-byte aligned. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadu_ps (float const *__P) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_ps(float const *__P) { return (vec_vsx_ld(0, __P)); } /* Load four SPFP values in reverse order. The address must be aligned. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadr_ps (float const *__P) -{ - __v4sf __tmp; - __m128 result; - static const __vector unsigned char permute_vector = - { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, - 0x17, 0x10, 0x11, 0x12, 0x13 }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadr_ps(float const *__P) { + __v4sf __tmp; + __m128 __result; + static const __vector unsigned char __permute_vector = { + 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, + 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; - __tmp = vec_ld (0, (__v4sf *) __P); - result = (__m128) vec_perm (__tmp, __tmp, permute_vector); - return result; + __tmp = vec_ld(0, (__v4sf *)__P); + __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector); + return __result; } /* Create a vector with all four elements equal to F. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set1_ps (float __F) -{ - return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_ps(float __F) { + return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_ps1 (float __F) -{ - return _mm_set1_ps (__F); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ps1(float __F) { + return _mm_set1_ps(__F); } /* Create the vector [Z Y X W]. 
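Note the split above: _mm_load_ps goes through vec_ld, so the 16-byte alignment requirement of the x86 original still applies, while _mm_loadu_ps uses vec_vsx_ld and tolerates any alignment. A small usage sketch, using the GNU aligned attribute already common in these headers:

#include <xmmintrin.h>

void load_demo(const float *p, float *out) {
  /* 16-byte aligned buffer: safe for the vec_ld-based _mm_load_ps. */
  float tmp[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128 va = _mm_load_ps(tmp);
  __m128 vb = _mm_loadu_ps(p);   /* p may be unaligned: vec_vsx_ld path */
  _mm_storeu_ps(out, _mm_add_ps(va, vb));
}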
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) -{ - return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { + return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; } /* Create the vector [W X Y Z]. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setr_ps (float __Z, float __Y, float __X, float __W) -{ - return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_ps(float __Z, float __Y, float __X, float __W) { + return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; } /* Store four SPFP values. The address must be 16-byte aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_ps (float *__P, __m128 __A) -{ - vec_st((__v4sf)__A, 0, (__v4sf*)__P); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps(float *__P, __m128 __A) { + vec_st((__v4sf)__A, 0, (__v4sf *)__P); } /* Store four SPFP values. The address need not be 16-byte aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storeu_ps (float *__P, __m128 __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_ps(float *__P, __m128 __A) { *(__m128_u *)__P = __A; } /* Store four SPFP values in reverse order. The address must be aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storer_ps (float *__P, __m128 __A) -{ - __v4sf __tmp; - static const __vector unsigned char permute_vector = - { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, - 0x17, 0x10, 0x11, 0x12, 0x13 }; +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storer_ps(float *__P, __m128 __A) { + __v4sf __tmp; + static const __vector unsigned char __permute_vector = { + 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, + 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; - __tmp = (__m128) vec_perm (__A, __A, permute_vector); + __tmp = (__m128)vec_perm(__A, __A, __permute_vector); - _mm_store_ps (__P, __tmp); + _mm_store_ps(__P, __tmp); } /* Store the lower SPFP value across four words. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store1_ps (float *__P, __m128 __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store1_ps(float *__P, __m128 __A) { __v4sf __va = vec_splat((__v4sf)__A, 0); - _mm_store_ps (__P, __va); + _mm_store_ps(__P, __va); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_ps1 (float *__P, __m128 __A) -{ - _mm_store1_ps (__P, __A); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps1(float *__P, __m128 __A) { + _mm_store1_ps(__P, __A); } /* Create a vector with element 0 as F and the rest zero. 
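The initializer order in _mm_set_ps above ({__W, __X, __Y, __Z}) is deliberate: the last argument lands in element 0, exactly as on x86, while _mm_setr_ps keeps the argument order in memory. A short sketch of the resulting layout:

#include <xmmintrin.h>

void layout_demo(float out_set[4], float out_setr[4]) {
  /* _mm_set_ps(Z,Y,X,W): W is element 0, so memory order is W,X,Y,Z. */
  _mm_storeu_ps(out_set,  _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f));  /* 0,1,2,3 */
  /* _mm_setr_ps keeps the argument order in memory. */
  _mm_storeu_ps(out_setr, _mm_setr_ps(3.0f, 2.0f, 1.0f, 0.0f)); /* 3,2,1,0 */
}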
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_set_ss (float __F) -{ - return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ss(float __F) { + return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; } /* Sets the low SPFP value of A from the low value of B. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_move_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; - return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); + return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask)); } /* Create a vector with element 0 as *P and the rest zero. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_ss (float const *__P) -{ - return _mm_set_ss (*__P); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ss(float const *__P) { + return _mm_set_ss(*__P); } /* Stores the lower SPFP value. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_ss (float *__P, __m128 __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ss(float *__P, __m128 __A) { *__P = ((__v4sf)__A)[0]; } @@ -217,612 +219,600 @@ _mm_store_ss (float *__P, __m128 __A) floating-point) values of A and B; the upper three SPFP values are passed through from A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_ss (__m128 __A, __m128 __B) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_ss(__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a + b; + __a = vec_splat(__A, 0); + __b = vec_splat(__B, 0); + __c = __a + __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); #else __A[0] = __A[0] + __B[0]; return (__A); #endif } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_ss (__m128 __A, __m128 __B) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_ss(__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. 
*/ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a - b; + __a = vec_splat(__A, 0); + __b = vec_splat(__B, 0); + __c = __a - __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); #else __A[0] = __A[0] - __B[0]; return (__A); #endif } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_ss (__m128 __A, __m128 __B) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_ss(__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a * b; + __a = vec_splat(__A, 0); + __b = vec_splat(__B, 0); + __c = __a * __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); #else __A[0] = __A[0] * __B[0]; return (__A); #endif } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_ss (__m128 __A, __m128 __B) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_ss(__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a / b; + __a = vec_splat(__A, 0); + __b = vec_splat(__B, 0); + __c = __a / __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); #else __A[0] = __A[0] / __B[0]; return (__A); #endif } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_ss (__m128 __A) -{ - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ss(__m128 __A) { + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_sqrt (a); + __a = vec_splat(__A, 0); + __c = vec_sqrt(__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); } /* Perform the respective operation on the four SPFP values in A and B. 
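The comments above describe the same pattern for every *_ss operation: splat element 0 so that whatever sits in the upper lanes cannot raise spurious exceptions, do the full-width VSX operation, then vec_sel the scalar result back under a one-element mask. The observable semantics match x86: only element 0 is computed, elements 1 through 3 pass through from the first operand. A small usage sketch:

#include <xmmintrin.h>

void add_ss_demo(float out[4]) {
  __m128 a = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); /* {10,20,30,40} */
  __m128 b = _mm_set1_ps(1.0f);
  /* Only lane 0 is added; lanes 1..3 come from a unchanged. */
  _mm_storeu_ps(out, _mm_add_ss(a, b));              /* {11,20,30,40} */
}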
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_ps (__m128 __A, __m128 __B) -{ - return (__m128) ((__v4sf)__A + (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A + (__v4sf)__B); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_ps (__m128 __A, __m128 __B) -{ - return (__m128) ((__v4sf)__A - (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A - (__v4sf)__B); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_ps (__m128 __A, __m128 __B) -{ - return (__m128) ((__v4sf)__A * (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A * (__v4sf)__B); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_ps (__m128 __A, __m128 __B) -{ - return (__m128) ((__v4sf)__A / (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A / (__v4sf)__B); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_ps (__m128 __A) -{ - return (vec_sqrt ((__v4sf)__A)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ps(__m128 __A) { + return (vec_sqrt((__v4sf)__A)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp_ps (__m128 __A) -{ - return (vec_re ((__v4sf)__A)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ps(__m128 __A) { + return (vec_re((__v4sf)__A)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt_ps (__m128 __A) -{ - return (vec_rsqrte (__A)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ps(__m128 __A) { + return (vec_rsqrte(__A)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp_ss (__m128 __A) -{ - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ss(__m128 __A) { + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = _mm_rcp_ps (a); + __a = vec_splat(__A, 0); + __c = _mm_rcp_ps(__a); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt_ss (__m128 __A) -{ - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ss(__m128 __A) { + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_rsqrte (a); + __a = vec_splat(__A, 0); + __c = vec_rsqrte(__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel(__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_ss (__m128 __A, __m128 __B) -{ - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ss(__m128 __A, __m128 __B) { + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf)__A, 0); - b = vec_splat ((__v4sf)__B, 0); - c = vec_min (a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = vec_min(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_ss (__m128 __A, __m128 __B) -{ - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ss(__m128 __A, __m128 __B) { + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = vec_max (a, b); + __a = vec_splat(__A, 0); + __b = vec_splat(__B, 0); + __c = vec_max(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
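_mm_rcp_ps and _mm_rsqrt_ps above map to vec_re and vec_rsqrte, which, like the x86 RCPPS/RSQRTPS they stand in for, are low-precision estimates. When full single-precision accuracy is needed, a Newton-Raphson step is the usual follow-up; the sketch below is a generic refinement written by the editor, not something these headers provide.

#include <xmmintrin.h>

/* One Newton-Raphson step on the reciprocal-square-root estimate:
   y' = y * (1.5 - 0.5 * x * y * y).  Roughly doubles the precision. */
static inline __m128 rsqrt_nr(__m128 x) {
  __m128 y     = _mm_rsqrt_ps(x);
  __m128 half  = _mm_set1_ps(0.5f);
  __m128 onep5 = _mm_set1_ps(1.5f);
  __m128 yy    = _mm_mul_ps(y, y);
  return _mm_mul_ps(y, _mm_sub_ps(onep5, _mm_mul_ps(_mm_mul_ps(half, x), yy)));
}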
*/ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_ps (__m128 __A, __m128 __B) -{ - __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); - return vec_sel (__B, __A, m); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ps(__m128 __A, __m128 __B) { + __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A); + return vec_sel(__B, __A, __m); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_ps (__m128 __A, __m128 __B) -{ - __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); - return vec_sel (__B, __A, m); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ps(__m128 __A, __m128 __B) { + __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B); + return vec_sel(__B, __A, __m); } /* Perform logical bit-wise operations on 128-bit values. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_and_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); -// return __builtin_ia32_andps (__A, __B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B)); + // return __builtin_ia32_andps (__A, __B); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_andnot_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_or_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_xor_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B)); } /* Perform a comparison on the four SPFP values of A and B. For each element, if the comparison is true, place a mask of all ones in the result, otherwise a mask of zeros. 
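_mm_min_ps and _mm_max_ps above are built from vec_cmpgt plus vec_sel rather than vec_min/vec_max, which preserves the x86 rule that the second operand is returned whenever the comparison is false, including when an operand is NaN. A scalar reference model of that rule, stated as the editor's reading of the code rather than anything the patch asserts:

/* Scalar model of one lane of _mm_min_ps(a, b) as written above:
   the result is a where (b > a) is true, otherwise b.  A NaN in either
   slot makes the compare false, so b (the second operand) comes back,
   which is the x86 MINPS behaviour. */
static float min_ps_lane(float a, float b) { return (b > a) ? a : b; }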
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_ps (__m128 __A, __m128 __B) -{ - __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); - return ((__m128)vec_nor (temp, temp)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ps(__m128 __A, __m128 __B) { + __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B); + return ((__m128)vec_nor(__temp, __temp)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnlt_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnle_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpngt_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); } 
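As the comment before these compares says, each packed comparison yields an all-ones or all-zero mask per element; combined with the _mm_and_ps/_mm_andnot_ps/_mm_or_ps wrappers above, that gives the usual branchless-select idiom. A usage sketch:

#include <xmmintrin.h>

/* Per-element "a > b ? a : b" using a compare mask and the logical ops. */
static inline __m128 select_gt(__m128 a, __m128 b) {
  __m128 mask = _mm_cmpgt_ps(a, b);            /* all-ones where a > b  */
  return _mm_or_ps(_mm_and_ps(mask, a),        /* take a where mask set */
                   _mm_andnot_ps(mask, b));    /* take b elsewhere      */
}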
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnge_ps (__m128 __A, __m128 __B) -{ - return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ps(__m128 __A, __m128 __B) { + return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpord_ps (__m128 __A, __m128 __B) -{ - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = - { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ps(__m128 __A, __m128 __B) { + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { + 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - return ((__m128 ) vec_and (c, d)); + __a = (__vector unsigned int)vec_abs((__v4sf)__A); + __b = (__vector unsigned int)vec_abs((__v4sf)__B); + __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); + __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); + return ((__m128)vec_and(__c, __d)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpunord_ps (__m128 __A, __m128 __B) -{ - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = - { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ps(__m128 __A, __m128 __B) { + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { + 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - return ((__m128 ) vec_or (c, d)); + __a = (__vector unsigned int)vec_abs((__v4sf)__A); + __b = (__vector unsigned int)vec_abs((__v4sf)__B); + __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); + __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); + return ((__m128)vec_or(__c, __d)); } /* Perform a comparison on the lower SPFP values of A and B. If the comparison is true, place a mask of all ones in the result, otherwise a mask of zeros. The upper three SPFP values are passed through from A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. 
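_mm_cmpunord_ps above detects NaNs without a floating-point compare: it takes vec_abs of each input, reinterprets the bits as unsigned integers, and tests whether they exceed 0x7f800000, the all-ones exponent field. A scalar sketch of that bit test, using a memcpy reinterpret instead of the vector types:

#include <stdint.h>
#include <string.h>

/* A float is NaN exactly when its absolute-value bit pattern is greater
   than 0x7f800000, which is the test the vector code performs with
   vec_abs + vec_cmpgt. */
static int lane_is_nan(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof bits);
  return (bits & 0x7fffffffu) > 0x7f800000u;
}

/* cmpunord is true when either lane is NaN; cmpord when neither is. */
static int lanes_unordered(float a, float b) {
  return lane_is_nan(a) || lane_is_nan(b);
}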
So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpeq(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. 
*/ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); - c = vec_nor (c, c); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpeq(__a, __b); + __c = vec_nor(__c, __c); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnlt_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. 
*/ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnle_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpngt_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpnge_ss (__m128 __A, __m128 __B) -{ - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ss(__m128 __A, __m128 __B) { + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we do the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat((__v4sf)__A, 0); + __b = vec_splat((__v4sf)__B, 0); + __c = (__v4sf)vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); -} - -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpord_ss (__m128 __A, __m128 __B) -{ - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = - { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - c = vec_and (c, d); + return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ss(__m128 __A, __m128 __B) { + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { + 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + + __a = (__vector unsigned int)vec_abs((__v4sf)__A); + __b = (__vector unsigned int)vec_abs((__v4sf)__B); + __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); + __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); + __c = vec_and(__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); -} - -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpunord_ss (__m128 __A, __m128 __B) -{ - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = - { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = - { 0xffffffff, 0, 0, 0 }; - - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - c = vec_or (c, d); + return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ss(__m128 __A, __m128 __B) { + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { + 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; + + __a = (__vector unsigned int)vec_abs((__v4sf)__A); + __b = (__vector unsigned int)vec_abs((__v4sf)__B); + __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); + __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); + __c = vec_or(__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); + return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); } /* Compare the lower SPFP values of A and B and return 1 if true and 0 if false. 
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comieq_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comieq_ss(__m128 __A, __m128 __B) { return (__A[0] == __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comilt_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comilt_ss(__m128 __A, __m128 __B) { return (__A[0] < __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comile_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comile_ss(__m128 __A, __m128 __B) { return (__A[0] <= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comigt_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comigt_ss(__m128 __A, __m128 __B) { return (__A[0] > __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comige_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comige_ss(__m128 __A, __m128 __B) { return (__A[0] >= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_comineq_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comineq_ss(__m128 __A, __m128 __B) { return (__A[0] != __B[0]); } @@ -834,56 +824,56 @@ _mm_comineq_ss (__m128 __A, __m128 __B) * compare and signal for QNaNs. * The __mm_ucomieq_sd et all should be OK, as is. 
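The note above observes that, strictly, the _mm_comi* family should use a signalling compare for QNaNs while the _mm_ucomi* family should not; in this port both reduce to ordinary C comparisons on element 0, so only the exception behaviour can differ from x86, not the returned value. A compare involving NaN yields 0 for everything except the != form. A small sketch of the resulting values:

#include <math.h>
#include <xmmintrin.h>

void comi_demo(int out[3]) {
  __m128 a = _mm_set_ss(1.0f);
  __m128 n = _mm_set_ss(NAN);
  out[0] = _mm_comieq_ss(a, a);   /* 1: 1.0f == 1.0f              */
  out[1] = _mm_comieq_ss(a, n);   /* 0: any == against NaN fails  */
  out[2] = _mm_comineq_ss(a, n);  /* 1: != is true against NaN    */
}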
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomieq_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomieq_ss(__m128 __A, __m128 __B) { return (__A[0] == __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomilt_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomilt_ss(__m128 __A, __m128 __B) { return (__A[0] < __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomile_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomile_ss(__m128 __A, __m128 __B) { return (__A[0] <= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomigt_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomigt_ss(__m128 __A, __m128 __B) { return (__A[0] > __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomige_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomige_ss(__m128 __A, __m128 __B) { return (__A[0] >= __B[0]); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_ucomineq_ss (__m128 __A, __m128 __B) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomineq_ss(__m128 __A, __m128 __B) { return (__A[0] != __B[0]); } -extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_f32 (__m128 __A) -{ +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_f32(__m128 __A) { return ((__v4sf)__A)[0]; } /* Convert the lower SPFP value to a 32-bit integer according to the current rounding mode. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_si32 (__m128 __A) -{ - __m64 res = 0; +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si32(__m128 __A) { + int __res; #ifdef _ARCH_PWR8 - double dtmp; + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -891,32 +881,30 @@ _mm_cvtss_si32 (__m128 __A) "xscvspdp %x2,%x0;\n" "fctiw %2,%2;\n" "mfvsrd %1,%x2;\n" - : "+wa" (__A), - "=r" (res), - "=f" (dtmp) - : ); + : "+wa"(__A), "=r"(__res), "=f"(__dtmp) + :); #else - res = __builtin_rint(__A[0]); + __res = __builtin_rint(__A[0]); #endif - return (res); + return __res; } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_ss2si (__m128 __A) -{ - return _mm_cvtss_si32 (__A); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ss2si(__m128 __A) { + return _mm_cvtss_si32(__A); } /* Convert the lower SPFP value to a 32-bit integer according to the current rounding mode. */ /* Intel intrinsic. 
*/ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_si64 (__m128 __A) -{ - __m64 res = 0; -#ifdef _ARCH_PWR8 - double dtmp; +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64(__m128 __A) { + long long __res; +#if defined(_ARCH_PWR8) && defined(__powerpc64__) + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -924,26 +912,23 @@ _mm_cvtss_si64 (__m128 __A) "xscvspdp %x2,%x0;\n" "fctid %2,%2;\n" "mfvsrd %1,%x2;\n" - : "+wa" (__A), - "=r" (res), - "=f" (dtmp) - : ); + : "+wa"(__A), "=r"(__res), "=f"(__dtmp) + :); #else - res = __builtin_llrint(__A[0]); + __res = __builtin_llrint(__A[0]); #endif - return (res); + return __res; } /* Microsoft intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_si64x (__m128 __A) -{ - return _mm_cvtss_si64 ((__v4sf) __A); +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64x(__m128 __A) { + return _mm_cvtss_si64((__v4sf)__A); } /* Constants for use with _mm_prefetch. */ -enum _mm_hint -{ +enum _mm_hint { /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */ _MM_HINT_ET0 = 7, _MM_HINT_ET1 = 6, @@ -955,368 +940,365 @@ enum _mm_hint /* Loads one cache line from address P to a location "closer" to the processor. The selector I specifies the type of prefetch operation. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_prefetch (const void *__P, enum _mm_hint __I) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_prefetch(const void *__P, enum _mm_hint __I) { /* Current PowerPC will ignores the hint parameters. */ - __builtin_prefetch (__P); + __builtin_prefetch(__P); } /* Convert the two lower SPFP values to 32-bit integers according to the current rounding mode. Return the integers in packed form. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_pi32 (__m128 __A) -{ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi32(__m128 __A) { /* Splat two lower SPFP values to both halves. */ - __v4sf temp, rounded; - __vector unsigned long long result; + __v4sf __temp, __rounded; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - rounded = vec_rint(temp); - result = (__vector unsigned long long) vec_cts (rounded, 0); + __temp = (__v4sf)vec_splat((__vector long long)__A, 0); + __rounded = vec_rint(__temp); + __result = (__vector unsigned long long)vec_cts(__rounded, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64)((__vector long long)__result)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_ps2pi (__m128 __A) -{ - return _mm_cvtps_pi32 (__A); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ps2pi(__m128 __A) { + return _mm_cvtps_pi32(__A); } /* Truncate the lower SPFP value to a 32-bit integer. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_si32 (__m128 __A) -{ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si32(__m128 __A) { /* Extract the lower float element. 
*/ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_ss2si (__m128 __A) -{ - return _mm_cvttss_si32 (__A); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ss2si(__m128 __A) { + return _mm_cvttss_si32(__A); } /* Intel intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_si64 (__m128 __A) -{ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64(__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Microsoft intrinsic. */ -extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_si64x (__m128 __A) -{ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64x(__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Truncate the two lower SPFP values to 32-bit integers. Return the integers in packed form. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttps_pi32 (__m128 __A) -{ - __v4sf temp; - __vector unsigned long long result; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_pi32(__m128 __A) { + __v4sf __temp; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - result = (__vector unsigned long long) vec_cts (temp, 0); + __temp = (__v4sf)vec_splat((__vector long long)__A, 0); + __result = (__vector unsigned long long)vec_cts(__temp, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64)((__vector long long)__result)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_ps2pi (__m128 __A) -{ - return _mm_cvttps_pi32 (__A); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ps2pi(__m128 __A) { + return _mm_cvttps_pi32(__A); } /* Convert B to a SPFP value and insert it as element zero in A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi32_ss (__m128 __A, int __B) -{ - float temp = __B; - __A[0] = temp; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_ss(__m128 __A, int __B) { + float __temp = __B; + __A[0] = __temp; return __A; } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_si2ss (__m128 __A, int __B) -{ - return _mm_cvtsi32_ss (__A, __B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_si2ss(__m128 __A, int __B) { + return _mm_cvtsi32_ss(__A, __B); } /* Convert B to a SPFP value and insert it as element zero in A. */ /* Intel intrinsic. 
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64_ss (__m128 __A, long long __B) -{ - float temp = __B; - __A[0] = temp; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_ss(__m128 __A, long long __B) { + float __temp = __B; + __A[0] = __temp; return __A; } /* Microsoft intrinsic. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsi64x_ss (__m128 __A, long long __B) -{ - return _mm_cvtsi64_ss (__A, __B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_ss(__m128 __A, long long __B) { + return _mm_cvtsi64_ss(__A, __B); } /* Convert the two 32-bit values in B to SPFP form and insert them as the two lower elements in A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpi32_ps (__m128 __A, __m64 __B) -{ - __vector signed int vm1; - __vector float vf1; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32_ps(__m128 __A, __m64 __B) { + __vector signed int __vm1; + __vector float __vf1; - vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; - vf1 = (__vector float) vec_ctf (vm1, 0); + __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B}; + __vf1 = (__vector float)vec_ctf(__vm1, 0); - return ((__m128) (__vector unsigned long long) - { ((__vector unsigned long long)vf1) [0], - ((__vector unsigned long long)__A) [1]}); + return ((__m128)(__vector unsigned long long){ + ((__vector unsigned long long)__vf1)[0], + ((__vector unsigned long long)__A)[1]}); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_pi2ps (__m128 __A, __m64 __B) -{ - return _mm_cvtpi32_ps (__A, __B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_pi2ps(__m128 __A, __m64 __B) { + return _mm_cvtpi32_ps(__A, __B); } /* Convert the four signed 16-bit values in A to SPFP form. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpi16_ps (__m64 __A) -{ - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi16_ps(__m64 __A) { + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; - vi4 = vec_vupklsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A}; + __vi4 = vec_vupklsh(__vs8); + __vf1 = (__vector float)vec_ctf(__vi4, 0); - return (__m128) vf1; + return (__m128)__vf1; } /* Convert the four unsigned 16-bit values in A to SPFP form. 
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpu16_ps (__m64 __A) -{ - const __vector unsigned short zero = - { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; - - vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; - vi4 = (__vector unsigned int) vec_mergel +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpu16_ps(__m64 __A) { + const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0}; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; + + __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A}; + __vi4 = (__vector unsigned int)vec_mergel #ifdef __LITTLE_ENDIAN__ - (vs8, zero); + (__vs8, __zero); #else - (zero, vs8); + (__zero, __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float)vec_ctf(__vi4, 0); - return (__m128) vf1; + return (__m128)__vf1; } /* Convert the low four signed 8-bit values in A to SPFP form. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpi8_ps (__m64 __A) -{ - __vector signed char vc16; - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi8_ps(__m64 __A) { + __vector signed char __vc16; + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; - vs8 = vec_vupkhsb (vc16); - vi4 = vec_vupkhsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A}; + __vs8 = vec_vupkhsb(__vc16); + __vi4 = vec_vupkhsh(__vs8); + __vf1 = (__vector float)vec_ctf(__vi4, 0); - return (__m128) vf1; + return (__m128)__vf1; } /* Convert the low four unsigned 8-bit values in A to SPFP form. 
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - -_mm_cvtpu8_ps (__m64 __A) -{ - const __vector unsigned char zero = - { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned char vc16; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; - - vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + + _mm_cvtpu8_ps(__m64 __A) { + const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0}; + __vector unsigned char __vc16; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; + + __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A}; #ifdef __LITTLE_ENDIAN__ - vs8 = (__vector unsigned short) vec_mergel (vc16, zero); - vi4 = (__vector unsigned int) vec_mergeh (vs8, - (__vector unsigned short) zero); + __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero); + __vi4 = + (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero); #else - vs8 = (__vector unsigned short) vec_mergel (zero, vc16); - vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, - vs8); + __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16); + __vi4 = + (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float)vec_ctf(__vi4, 0); - return (__m128) vf1; + return (__m128)__vf1; } /* Convert the four signed 32-bit values in A and B to SPFP form. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpi32x2_ps (__m64 __A, __m64 __B) -{ - __vector signed int vi4; - __vector float vf4; +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { + __vector signed int __vi4; + __vector float __vf4; - vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; - vf4 = (__vector float) vec_ctf (vi4, 0); - return (__m128) vf4; + __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B}; + __vf4 = (__vector float)vec_ctf(__vi4, 0); + return (__m128)__vf4; } /* Convert the four SPFP values in A to four signed 16-bit integers. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_pi16 (__m128 __A) -{ - __v4sf rounded; - __vector signed int temp; - __vector unsigned long long result; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi16(__m128 __A) { + __v4sf __rounded; + __vector signed int __temp; + __vector unsigned long long __result; - rounded = vec_rint(__A); - temp = vec_cts (rounded, 0); - result = (__vector unsigned long long) vec_pack (temp, temp); + __rounded = vec_rint(__A); + __temp = vec_cts(__rounded, 0); + __result = (__vector unsigned long long)vec_pack(__temp, __temp); - return (__m64) ((__vector long long) result)[0]; + return (__m64)((__vector long long)__result)[0]; } /* Convert the four SPFP values in A to four signed 8-bit integers. 
*/ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_pi8 (__m128 __A) -{ - __v4sf rounded; - __vector signed int tmp_i; - static const __vector signed int zero = {0, 0, 0, 0}; - __vector signed short tmp_s; - __vector signed char res_v; - - rounded = vec_rint(__A); - tmp_i = vec_cts (rounded, 0); - tmp_s = vec_pack (tmp_i, zero); - res_v = vec_pack (tmp_s, tmp_s); - return (__m64) ((__vector long long) res_v)[0]; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi8(__m128 __A) { + __v4sf __rounded; + __vector signed int __tmp_i; + static const __vector signed int __zero = {0, 0, 0, 0}; + __vector signed short __tmp_s; + __vector signed char __res_v; + + __rounded = vec_rint(__A); + __tmp_i = vec_cts(__rounded, 0); + __tmp_s = vec_pack(__tmp_i, __zero); + __res_v = vec_pack(__tmp_s, __tmp_s); + return (__m64)((__vector long long)__res_v)[0]; } /* Selects four specific SPFP values from A and B based on MASK. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - -_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) -{ - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = - { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + + _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C #else 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif - }; - __vector unsigned int t; + }; + __vector unsigned int __t; - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t); } /* Selects and interleaves the upper two SPFP values from A and B. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpackhi_ps (__m128 __A, __m128 __B) -{ - return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_ps(__m128 __A, __m128 __B) { + return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B); } /* Selects and interleaves the lower two SPFP values from A and B. 
*/ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_unpacklo_ps (__m128 __A, __m128 __B) -{ - return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_ps(__m128 __A, __m128 __B) { + return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B); } /* Sets the upper two SPFP values with 64-bits of data loaded from P; the lower two values are passed through from A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadh_pi (__m128 __A, __m64 const *__P) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadh_pi(__m128 __A, __m64 const *__P) { __vector unsigned long long __a = (__vector unsigned long long)__A; __vector unsigned long long __p = vec_splats(*__P); - __a [1] = __p [1]; + __a[1] = __p[1]; return (__m128)__a; } /* Stores the upper two SPFP values of A into P. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storeh_pi (__m64 *__P, __m128 __A) -{ - __vector unsigned long long __a = (__vector unsigned long long) __A; +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeh_pi(__m64 *__P, __m128 __A) { + __vector unsigned long long __a = (__vector unsigned long long)__A; *__P = __a[1]; } /* Moves the upper two values of B into the lower two values of A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movehl_ps (__m128 __A, __m128 __B) -{ - return (__m128) vec_mergel ((__vector unsigned long long)__B, - (__vector unsigned long long)__A); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movehl_ps(__m128 __A, __m128 __B) { + return (__m128)vec_mergel((__vector unsigned long long)__B, + (__vector unsigned long long)__A); } /* Moves the lower two values of B into the upper two values of A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movelh_ps (__m128 __A, __m128 __B) -{ - return (__m128) vec_mergeh ((__vector unsigned long long)__A, - (__vector unsigned long long)__B); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movelh_ps(__m128 __A, __m128 __B) { + return (__m128)vec_mergeh((__vector unsigned long long)__A, + (__vector unsigned long long)__B); } /* Sets the lower two SPFP values with 64-bits of data loaded from P; the upper two values are passed through from A. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadl_pi (__m128 __A, __m64 const *__P) -{ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_pi(__m128 __A, __m64 const *__P) { __vector unsigned long long __a = (__vector unsigned long long)__A; __vector unsigned long long __p = vec_splats(*__P); - __a [0] = __p [0]; + __a[0] = __p[0]; return (__m128)__a; } /* Stores the lower two SPFP values of A into P. 
*/ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_storel_pi (__m64 *__P, __m128 __A) -{ - __vector unsigned long long __a = (__vector unsigned long long) __A; +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_pi(__m64 *__P, __m128 __A) { + __vector unsigned long long __a = (__vector unsigned long long)__A; *__P = __a[0]; } @@ -1325,453 +1307,456 @@ _mm_storel_pi (__m64 *__P, __m128 __A) /* Intrinsic functions that require PowerISA 2.07 minimum. */ /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movemask_ps (__m128 __A) -{ - __vector unsigned long long result; - static const __vector unsigned int perm_mask = - { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_ps(__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm((__vector unsigned int)__A); +#else + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ - 0x00204060, 0x80808080, 0x80808080, 0x80808080 + 0x00204060, 0x80808080, 0x80808080, 0x80808080 #else 0x80808080, 0x80808080, 0x80808080, 0x00204060 #endif - }; + }; - result = ((__vector unsigned long long) - vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + __result = ((__vector unsigned long long)vec_vbpermq( + (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ /* Create a vector with all four elements equal to *P. */ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load1_ps (float const *__P) -{ - return _mm_set1_ps (*__P); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load1_ps(float const *__P) { + return _mm_set1_ps(*__P); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_load_ps1 (float const *__P) -{ - return _mm_load1_ps (__P); +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps1(float const *__P) { + return _mm_load1_ps(__P); } /* Extracts one of the four words of A. The selector N must be immediate. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_extract_pi16 (__m64 const __A, int const __N) -{ - unsigned int shiftr = __N & 3; +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_pi16(__m64 const __A, int const __N) { + unsigned int __shiftr = __N & 3; #ifdef __BIG_ENDIAN__ - shiftr = 3 - shiftr; + __shiftr = 3 - __shiftr; #endif - return ((__A >> (shiftr * 16)) & 0xffff); + return ((__A >> (__shiftr * 16)) & 0xffff); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pextrw (__m64 const __A, int const __N) -{ - return _mm_extract_pi16 (__A, __N); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pextrw(__m64 const __A, int const __N) { + return _mm_extract_pi16(__A, __N); } /* Inserts word D into one of four words of A. The selector N must be immediate. 
*/ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) -{ - const int shiftl = (__N & 3) * 16; - const __m64 shiftD = (const __m64) __D << shiftl; - const __m64 mask = 0xffffUL << shiftl; - __m64 result = (__A & (~mask)) | (shiftD & mask); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { + const int __shiftl = (__N & 3) * 16; + const __m64 __shiftD = (const __m64)__D << __shiftl; + const __m64 __mask = 0xffffUL << __shiftl; + __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); - return (result); + return __result; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pinsrw (__m64 const __A, int const __D, int const __N) -{ - return _mm_insert_pi16 (__A, __D, __N); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pinsrw(__m64 const __A, int const __D, int const __N) { + return _mm_insert_pi16(__A, __D, __N); } /* Compute the element-wise maximum of signed 16-bit values. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_pi16 (__m64 __A, __m64 __B) -{ + _mm_max_pi16(__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats(__A); + __b = (__vector signed short)vec_splats(__B); + __c = (__vector __bool short)vec_cmpgt(__a, __b); + __r = vec_sel(__b, __a, __c); + return (__m64)((__vector long long)__r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] + : __m2.as_short[0]; + __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] + : __m2.as_short[1]; + __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] + : __m2.as_short[2]; + __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] + : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64)__res.as_m64; #endif } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pmaxsw (__m64 __A, __m64 __B) -{ - return _mm_max_pi16 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxsw(__m64 __A, __m64 __B) { + return _mm_max_pi16(__A, __B); } /* Compute the element-wise maximum of unsigned 8-bit values. 
*/ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_pu8 (__m64 __A, __m64 __B) -{ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pu8(__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats(__A); + __b = (__vector unsigned char)vec_splats(__B); + __c = (__vector __bool char)vec_cmpgt(__a, __b); + __r = vec_sel(__b, __a, __c); + return (__m64)((__vector long long)__r)[0]; #else - __m64_union m1, m2, res; - long i; - - m1.as_m64 = __A; - m2.as_m64 = __B; + __m64_union __m1, __m2, __res; + long __i; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? - m1.as_char[i] : m2.as_char[i]; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i]) + ? __m1.as_char[__i] + : __m2.as_char[__i]; - return (__m64) res.as_m64; + return (__m64)__res.as_m64; #endif } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pmaxub (__m64 __A, __m64 __B) -{ - return _mm_max_pu8 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxub(__m64 __A, __m64 __B) { + return _mm_max_pu8(__A, __B); } /* Compute the element-wise minimum of signed 16-bit values. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_pi16 (__m64 __A, __m64 __B) -{ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pi16(__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats(__A); + __b = (__vector signed short)vec_splats(__B); + __c = (__vector __bool short)vec_cmplt(__a, __b); + __r = vec_sel(__b, __a, __c); + return (__m64)((__vector long long)__r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] + : __m2.as_short[0]; + __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] + : __m2.as_short[1]; + __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] + : __m2.as_short[2]; + __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? 
__m1.as_short[3] + : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64)__res.as_m64; #endif } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pminsw (__m64 __A, __m64 __B) -{ - return _mm_min_pi16 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminsw(__m64 __A, __m64 __B) { + return _mm_min_pi16(__A, __B); } /* Compute the element-wise minimum of unsigned 8-bit values. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_pu8 (__m64 __A, __m64 __B) -{ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pu8(__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats(__A); + __b = (__vector unsigned char)vec_splats(__B); + __c = (__vector __bool char)vec_cmplt(__a, __b); + __r = vec_sel(__b, __a, __c); + return (__m64)((__vector long long)__r)[0]; #else - __m64_union m1, m2, res; - long i; + __m64_union __m1, __m2, __res; + long __i; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i]) + ? __m1.as_char[__i] + : __m2.as_char[__i]; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? - m1.as_char[i] : m2.as_char[i]; - - return (__m64) res.as_m64; + return (__m64)__res.as_m64; #endif } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pminub (__m64 __A, __m64 __B) -{ - return _mm_min_pu8 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminub(__m64 __A, __m64 __B) { + return _mm_min_pu8(__A, __B); } /* Create an 8-bit mask of the signs of 8-bit values. 
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_movemask_pi8 (__m64 __A) -{ - unsigned long long p = +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_pi8(__m64 __A) { +#ifdef __powerpc64__ + unsigned long long __p = +#ifdef __LITTLE_ENDIAN__ + 0x0008101820283038UL; // permute control for sign bits +#else + 0x3830282018100800UL; // permute control for sign bits +#endif + return __builtin_bpermd(__p, __A); +#else #ifdef __LITTLE_ENDIAN__ - 0x0008101820283038UL; // permute control for sign bits + unsigned int __mask = 0x20283038UL; + unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf; + unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf; #else - 0x3830282018100800UL; // permute control for sign bits + unsigned int __mask = 0x38302820UL; + unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf; + unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf; +#endif + return (__r2 << 4) | __r1; #endif - return __builtin_bpermd (p, __A); } -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pmovmskb (__m64 __A) -{ - return _mm_movemask_pi8 (__A); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmovmskb(__m64 __A) { + return _mm_movemask_pi8(__A); } /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values in B and produce the high 16 bits of the 32-bit results. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mulhi_pu16 (__m64 __A, __m64 __B) -{ - __vector unsigned short a, b; - __vector unsigned short c; - __vector unsigned int w0, w1; - __vector unsigned char xform1 = { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pu16(__m64 __A, __m64 __B) { + __vector unsigned short __a, __b; + __vector unsigned short __c; + __vector unsigned int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ - 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, - 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, + 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F #else - 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, - 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, + 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 #endif - }; + }; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); + __a = (__vector unsigned short)vec_splats(__A); + __b = (__vector unsigned short)vec_splats(__B); - w0 = vec_vmuleuh (a, b); - w1 = vec_vmulouh (a, b); - c = (__vector unsigned short)vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh(__a, __b); + __w1 = vec_vmulouh(__a, __b); + __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1); - return (__m64) ((__vector long long) c)[0]; + return (__m64)((__vector long long)__c)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pmulhuw (__m64 __A, __m64 __B) -{ - return _mm_mulhi_pu16 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhuw(__m64 __A, __m64 __B) { + return _mm_mulhi_pu16(__A, __B); } /* Return a combination of the four 16-bit values in A. The selector must be an immediate. 
*/ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_pi16 (__m64 __A, int const __N) -{ - unsigned long element_selector_10 = __N & 0x03; - unsigned long element_selector_32 = (__N >> 2) & 0x03; - unsigned long element_selector_54 = (__N >> 4) & 0x03; - unsigned long element_selector_76 = (__N >> 6) & 0x03; - static const unsigned short permute_selectors[4] = - { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pi16(__m64 __A, int const __N) { + unsigned long __element_selector_10 = __N & 0x03; + unsigned long __element_selector_32 = (__N >> 2) & 0x03; + unsigned long __element_selector_54 = (__N >> 4) & 0x03; + unsigned long __element_selector_76 = (__N >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ - 0x0908, 0x0B0A, 0x0D0C, 0x0F0E + 0x0908, 0x0B0A, 0x0D0C, 0x0F0E #else - 0x0607, 0x0405, 0x0203, 0x0001 + 0x0607, 0x0405, 0x0203, 0x0001 #endif - }; - __m64_union t; - __vector unsigned long long a, p, r; + }; + __m64_union __t; + __vector unsigned long long __a, __p, __r; #ifdef __LITTLE_ENDIAN__ - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; #else - t.as_short[3] = permute_selectors[element_selector_10]; - t.as_short[2] = permute_selectors[element_selector_32]; - t.as_short[1] = permute_selectors[element_selector_54]; - t.as_short[0] = permute_selectors[element_selector_76]; + __t.as_short[3] = __permute_selectors[__element_selector_10]; + __t.as_short[2] = __permute_selectors[__element_selector_32]; + __t.as_short[1] = __permute_selectors[__element_selector_54]; + __t.as_short[0] = __permute_selectors[__element_selector_76]; #endif - p = vec_splats (t.as_m64); - a = vec_splats (__A); - r = vec_perm (a, a, (__vector unsigned char)p); - return (__m64) ((__vector long long) r)[0]; + __p = vec_splats(__t.as_m64); + __a = vec_splats(__A); + __r = vec_perm(__a, __a, (__vector unsigned char)__p); + return (__m64)((__vector long long)__r)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pshufw (__m64 __A, int const __N) -{ - return _mm_shuffle_pi16 (__A, __N); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pshufw(__m64 __A, int const __N) { + return _mm_shuffle_pi16(__A, __N); } /* Conditionally store byte elements of A into P. The high bit of each byte in the selector N determines whether the corresponding byte from A is stored. 
*/ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) -{ - __m64 hibit = 0x8080808080808080UL; - __m64 mask, tmp; - __m64 *p = (__m64*)__P; +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { + __m64 __hibit = 0x8080808080808080UL; + __m64 __mask, __tmp; + __m64 *__p = (__m64 *)__P; - tmp = *p; - mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); - tmp = (tmp & (~mask)) | (__A & mask); - *p = tmp; + __tmp = *__p; + __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit); + __tmp = (__tmp & (~__mask)) | (__A & __mask); + *__p = __tmp; } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_maskmovq (__m64 __A, __m64 __N, char *__P) -{ - _mm_maskmove_si64 (__A, __N, __P); +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_maskmovq(__m64 __A, __m64 __N, char *__P) { + _mm_maskmove_si64(__A, __N, __P); } /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_avg_pu8 (__m64 __A, __m64 __B) -{ - __vector unsigned char a, b, c; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu8(__m64 __A, __m64 __B) { + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats(__A); + __b = (__vector unsigned char)vec_splats(__B); + __c = vec_avg(__a, __b); + return (__m64)((__vector long long)__c)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pavgb (__m64 __A, __m64 __B) -{ - return _mm_avg_pu8 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgb(__m64 __A, __m64 __B) { + return _mm_avg_pu8(__A, __B); } /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_avg_pu16 (__m64 __A, __m64 __B) -{ - __vector unsigned short a, b, c; +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu16(__m64 __A, __m64 __B) { + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats(__A); + __b = (__vector unsigned short)vec_splats(__B); + __c = vec_avg(__a, __b); + return (__m64)((__vector long long)__c)[0]; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_pavgw (__m64 __A, __m64 __B) -{ - return _mm_avg_pu16 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgw(__m64 __A, __m64 __B) { + return _mm_avg_pu16(__A, __B); } /* Compute the sum of the absolute differences of the unsigned 8-bit values in A and B. Return the value in the lower 16-bit word; the upper words are cleared. 
*/ -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sad_pu8 (__m64 __A, __m64 __B) -{ - __vector unsigned char a, b; - __vector unsigned char vmin, vmax, vabsdiff; - __vector signed int vsum; - const __vector unsigned int zero = - { 0, 0, 0, 0 }; - __m64_union result = {0}; - - a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; - b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; - vmin = vec_min (a, b); - vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sad_pu8(__m64 __A, __m64 __B) { + __vector unsigned char __a, __b; + __vector unsigned char __vmin, __vmax, __vabsdiff; + __vector signed int __vsum; + const __vector unsigned int __zero = {0, 0, 0, 0}; + __m64_union __result = {0}; + + __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A}; + __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B}; + __vmin = vec_min(__a, __b); + __vmax = vec_max(__a, __b); + __vabsdiff = vec_sub(__vmax, __vmin); /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); /* Sum across four integers with integer result. */ - vsum = vec_sums (vsum, (__vector signed int) zero); + __vsum = vec_sums(__vsum, (__vector signed int)__zero); /* The sum is in the right most 32-bits of the vector result. Transfer to a GPR and truncate to 16 bits. */ - result.as_short[0] = vsum[3]; - return result.as_m64; + __result.as_short[0] = __vsum[3]; + return __result.as_m64; } -extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_m_psadbw (__m64 __A, __m64 __B) -{ - return _mm_sad_pu8 (__A, __B); +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psadbw(__m64 __A, __m64 __B) { + return _mm_sad_pu8(__A, __B); } /* Stores the data in A to the address P without polluting the caches. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_pi (__m64 *__P, __m64 __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_pi(__m64 *__P, __m64 __A) { /* Use the data cache block touch for store transient. */ - __asm__ ( - " dcbtstt 0,%0" - : - : "b" (__P) - : "memory" - ); + __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); *__P = __A; } /* Likewise. The address must be 16-byte aligned. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_stream_ps (float *__P, __m128 __A) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_ps(float *__P, __m128 __A) { /* Use the data cache block touch for store transient. */ - __asm__ ( - " dcbtstt 0,%0" - : - : "b" (__P) - : "memory" - ); - _mm_store_ps (__P, __A); + __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); + _mm_store_ps(__P, __A); } /* Guarantees that every preceding store is globally visible before any subsequent store. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sfence (void) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sfence(void) { /* Generate a light weight sync. 
*/ - __atomic_thread_fence (__ATOMIC_RELEASE); + __atomic_thread_fence(__ATOMIC_RELEASE); } /* The execution of the next instruction is delayed by an implementation @@ -1779,9 +1764,9 @@ _mm_sfence (void) architectural state. This is after the pop_options pragma because it does not require SSE support in the processor--the encoding is a nop on processors that do not support it. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_pause (void) -{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_pause(void) { /* There is no exact match with this construct, but the following is close to the desired effect. */ #if _ARCH_PWR8 @@ -1797,47 +1782,46 @@ _mm_pause (void) PRI and continue execution. */ unsigned long __PPR; - __asm__ volatile ( - " mfppr %0;" - " or 31,31,31;" - " isync;" - " lwsync;" - " isync;" - " mtppr %0;" - : "=r" (__PPR) - : - : "memory" - ); + __asm__ volatile(" mfppr %0;" + " or 31,31,31;" + " isync;" + " lwsync;" + " isync;" + " mtppr %0;" + : "=r"(__PPR) + : + : "memory"); #else /* For older processor where we may not even have Program Priority controls we can only depend on Heavy Weight Sync. */ - __atomic_thread_fence (__ATOMIC_SEQ_CST); + __atomic_thread_fence(__ATOMIC_SEQ_CST); #endif } /* Transpose the 4x4 matrix composed of row[0-3]. */ -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ -do { \ - __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ - __v4sf __t0 = vec_vmrghw (__r0, __r1); \ - __v4sf __t1 = vec_vmrghw (__r2, __r3); \ - __v4sf __t2 = vec_vmrglw (__r0, __r1); \ - __v4sf __t3 = vec_vmrglw (__r2, __r3); \ - (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \ - (__vector long long)__t1); \ - (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \ - (__vector long long)__t1); \ - (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \ - (__vector long long)__t3); \ - (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \ - (__vector long long)__t3); \ -} while (0) +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ + __v4sf __t0 = vec_vmrghw(__r0, __r1); \ + __v4sf __t1 = vec_vmrghw(__r2, __r3); \ + __v4sf __t2 = vec_vmrglw(__r0, __r1); \ + __v4sf __t3 = vec_vmrglw(__r2, __r3); \ + (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \ + (__vector long long)__t1); \ + (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \ + (__vector long long)__t1); \ + (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \ + (__vector long long)__t3); \ + (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \ + (__vector long long)__t3); \ + } while (0) /* For backward source compatibility. */ //# include <emmintrin.h> #else #include_next <xmmintrin.h> -#endif /* defined(__linux__) && defined(__ppc64__) */ +#endif /* defined(__powerpc64__) && \ + * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif /* XMMINTRIN_H_ */ |
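
Editorial note on usage, beyond the patch text itself: the point of this xmmintrin.h wrapper is that unmodified SSE source compiles on POWER, with __m128 mapped onto a VSX vector. Below is a minimal sketch of such a translation unit. The compile command, the -DNO_WARN_X86_INTRINSICS define (used by these wrapper headers to suppress the porting diagnostic), and the -mcpu=power8 choice are illustrative assumptions, not taken from the diff; the same source is expected to build unchanged with an x86 compiler.

/* sse_on_power.c -- illustrative only; e.g. on powerpc64le:
 *   clang -mcpu=power8 -DNO_WARN_X86_INTRINSICS sse_on_power.c -o sse_on_power
 */
#include <stdio.h>
#include <xmmintrin.h> /* resolves to the ppc_wrappers header on POWER */

int main(void) {
  /* _mm_set_ps lists elements from most- to least-significant lane. */
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);

  /* Element-wise add, then reverse the four lanes of the sum. */
  __m128 sum = _mm_add_ps(a, b);
  __m128 rev = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 1, 2, 3));

  float out[4];
  _mm_storeu_ps(out, rev);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  /* Expected: 44 33 22 11 */
  return 0;
}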
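The selector decoding in the patched _mm_shuffle_ps (the __element_selector_10/_32/_54/_76 fields feeding a byte-permute control, with 0x10101010 added for the upper lanes so vec_perm reads from the second operand) implements the usual SSE rule: the two low bit pairs of the mask pick elements of A for result lanes 0-1, the two high pairs pick elements of B for lanes 2-3. The following scalar reference models that rule and is handy for checking the permute-selector table; it is a sketch of the documented semantics, not code from the header.

#include <stdio.h>

/* Scalar model of _mm_shuffle_ps(a, b, mask): lanes 0-1 come from a,
 * lanes 2-3 from b, each chosen by one 2-bit field of the mask.      */
static void shuffle_ps_ref(const float a[4], const float b[4],
                           unsigned mask, float out[4]) {
  out[0] = a[mask & 0x3];        /* __element_selector_10 */
  out[1] = a[(mask >> 2) & 0x3]; /* __element_selector_32 */
  out[2] = b[(mask >> 4) & 0x3]; /* __element_selector_54 */
  out[3] = b[(mask >> 6) & 0x3]; /* __element_selector_76 */
}

int main(void) {
  const float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
  float r[4];
  shuffle_ps_ref(a, b, 0x1B, r); /* 0x1B == _MM_SHUFFLE(0,1,2,3) */
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); /* 4 3 20 10 */
  return 0;
}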
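The _MM_TRANSPOSE4_PS macro at the end of the file transposes a 4x4 matrix held in four __m128 rows with two merge-high/merge-low passes (word merges followed by doubleword merges). A short usage sketch under the same assumptions as above (powerpc64le with VSX, -DNO_WARN_X86_INTRINSICS); the data values are made up for illustration.

#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  float m[4][4] = {{ 0.0f,  1.0f,  2.0f,  3.0f},
                   { 4.0f,  5.0f,  6.0f,  7.0f},
                   { 8.0f,  9.0f, 10.0f, 11.0f},
                   {12.0f, 13.0f, 14.0f, 15.0f}};
  __m128 r0 = _mm_loadu_ps(m[0]);
  __m128 r1 = _mm_loadu_ps(m[1]);
  __m128 r2 = _mm_loadu_ps(m[2]);
  __m128 r3 = _mm_loadu_ps(m[3]);

  /* After the macro, r0..r3 hold the columns of the original matrix. */
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

  _mm_storeu_ps(m[0], r0);
  _mm_storeu_ps(m[1], r1);
  _mm_storeu_ps(m[2], r2);
  _mm_storeu_ps(m[3], r3);
  for (int i = 0; i < 4; i++)
    printf("%g %g %g %g\n", m[i][0], m[i][1], m[i][2], m[i][3]);
  /* Prints: 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15 */
  return 0;
}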