author      Dimitry Andric <dim@FreeBSD.org>    2017-04-16 16:02:28 +0000
committer   Dimitry Andric <dim@FreeBSD.org>    2017-04-16 16:02:28 +0000
commit      7442d6faa2719e4e7d33a7021c406c5a4facd74d (patch)
tree        c72b9241553fc9966179aba84f90f17bfa9235c3 /lib/Headers
parent      b52119637f743680a99710ce5fdb6646da2772af (diff)
Vendor import of clang trunk r300422 (vendor/clang/clang-trunk-r300422)

Notes:
    svn path=/vendor/clang/dist/; revision=317019
    svn path=/vendor/clang/clang-trunk-r300422/; revision=317020; tag=vendor/clang/clang-trunk-r300422
Diffstat (limited to 'lib/Headers')
-rw-r--r--   lib/Headers/CMakeLists.txt      |    1
-rw-r--r--   lib/Headers/altivec.h           |   44
-rw-r--r--   lib/Headers/avx2intrin.h        |    2
-rw-r--r--   lib/Headers/avx512bwintrin.h    |  104
-rw-r--r--   lib/Headers/avx512dqintrin.h    |   95
-rw-r--r--   lib/Headers/avx512fintrin.h     |  189
-rw-r--r--   lib/Headers/avx512vldqintrin.h  |   42
-rw-r--r--   lib/Headers/avx512vlintrin.h    |   45
-rw-r--r--   lib/Headers/avxintrin.h         |  296
-rw-r--r--   lib/Headers/clzerointrin.h      |   50
-rw-r--r--   lib/Headers/emmintrin.h         |   38
-rw-r--r--   lib/Headers/f16cintrin.h        |   10
-rw-r--r--   lib/Headers/htmxlintrin.h       |   14
-rw-r--r--   lib/Headers/intrin.h            |   50
-rw-r--r--   lib/Headers/mmintrin.h          |    2
-rw-r--r--   lib/Headers/module.modulemap    |    1
-rw-r--r--   lib/Headers/opencl-c.h          |  884
-rw-r--r--   lib/Headers/pmmintrin.h         |   12
-rw-r--r--   lib/Headers/prfchwintrin.h      |   24
-rw-r--r--   lib/Headers/smmintrin.h         | 2013
-rw-r--r--   lib/Headers/stdarg.h            |    3
-rw-r--r--   lib/Headers/tgmath.h            |   16
-rw-r--r--   lib/Headers/x86intrin.h         |    4
-rw-r--r--   lib/Headers/xmmintrin.h         |   14
-rw-r--r--   lib/Headers/xopintrin.h         |    4
25 files changed, 2768 insertions, 1189 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index efc4dd0971b6..35aff4017e93 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -28,6 +28,7 @@ set(files
__clang_cuda_intrinsics.h
__clang_cuda_math_forward_declares.h
__clang_cuda_runtime_wrapper.h
+ clzerointrin.h
cpuid.h
clflushoptintrin.h
emmintrin.h
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index a01d9d837ad1..421e2a7754a5 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -8045,45 +8045,51 @@ static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
/* vec_sl */
-static __inline__ vector signed char __ATTRS_o_ai
-vec_sl(vector signed char __a, vector unsigned char __b) {
- return __a << (vector signed char)__b;
-}
-
+// vec_sl performs modulo arithmetic on __b first, so each shift count in __b
+// may exceed the bit width of the corresponding element of __a.
static __inline__ vector unsigned char __ATTRS_o_ai
vec_sl(vector unsigned char __a, vector unsigned char __b) {
- return __a << __b;
+ return __a << (__b %
+ (vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__));
}
-static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
- vector unsigned short __b) {
- return __a << (vector short)__b;
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
+ return (vector signed char)vec_sl((vector unsigned char)__a, __b);
}
static __inline__ vector unsigned short __ATTRS_o_ai
vec_sl(vector unsigned short __a, vector unsigned short __b) {
- return __a << __b;
+ return __a << (__b % (vector unsigned short)(sizeof(unsigned short) *
+ __CHAR_BIT__));
}
-static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
- vector unsigned int __b) {
- return __a << (vector int)__b;
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+ vector unsigned short __b) {
+ return (vector short)vec_sl((vector unsigned short)__a, __b);
}
static __inline__ vector unsigned int __ATTRS_o_ai
vec_sl(vector unsigned int __a, vector unsigned int __b) {
- return __a << __b;
+ return __a << (__b %
+ (vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__));
}
-#ifdef __POWER8_VECTOR__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_sl(vector signed long long __a, vector unsigned long long __b) {
- return __a << (vector long long)__b;
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+ vector unsigned int __b) {
+ return (vector int)vec_sl((vector unsigned int)__a, __b);
}
+#ifdef __POWER8_VECTOR__
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
- return __a << __b;
+ return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) *
+ __CHAR_BIT__));
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_sl(vector long long __a, vector unsigned long long __b) {
+ return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#endif
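
As a usage sketch of the new modulo semantics (not part of the patch; assumes a POWER target built with -maltivec, and uses the Clang/GCC vector-subscript extension):

#include <altivec.h>

int test_modulo_shift(void) {
  vector unsigned int ones = {1, 1, 1, 1};
  /* Shift counts are reduced modulo the 32-bit element width, so a
     count of 33 behaves like a shift by 1 instead of being undefined. */
  vector unsigned int counts = {33, 33, 33, 33};
  vector unsigned int r = vec_sl(ones, counts);
  return r[0] == 2;  /* expected to hold for every element */
}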
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index 13bcbef4dbbe..5d83a8db484b 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -832,7 +832,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_stream_load_si256(__m256i const *__V)
{
- return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+ return (__m256i)__builtin_nontemporal_load((const __v4di *)__V);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
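
The generic __builtin_nontemporal_load carries the same non-temporal hint as the retired MOVNTDQA builtin, but is visible to target-independent optimizations. Callers are unaffected; a minimal sketch (assumes -mavx2 and 32-byte-aligned input):

#include <immintrin.h>

__m256i sum_streamed(const __m256i *p, int n) {
  __m256i acc = _mm256_setzero_si256();
  for (int i = 0; i < n; ++i)
    /* Non-temporal load: bypasses the cache hierarchy where supported. */
    acc = _mm256_add_epi32(acc, _mm256_stream_load_si256(&p[i]));
  return acc;
}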
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index 629dc8611a7f..41958b7214e2 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -504,115 +504,91 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi32 (__m512i __A, __m512i __B)
+_mm512_packs_epi32(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) _mm512_setzero_hi(),
- __M);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packs_epi32(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
- __m512i __B)
+_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) __W,
- __M);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packs_epi32(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi16 (__m512i __A, __m512i __B)
+_mm512_packs_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) -1);
+ return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
- __m512i __B)
+_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) __W,
- (__mmask64) __M);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packs_epi16(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) _mm512_setzero_qi(),
- __M);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packs_epi16(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi32 (__m512i __A, __m512i __B)
+_mm512_packus_epi32(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) _mm512_setzero_hi(),
- __M);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packus_epi32(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
- __m512i __B)
+_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v32hi) __W,
- __M);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packus_epi32(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi16 (__m512i __A, __m512i __B)
+_mm512_packus_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) -1);
+ return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
- __m512i __B)
+_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) __W,
- (__mmask64) __M);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packus_epi16(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __M);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packus_epi16(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
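
The rewrite expresses every masked pack as the unmasked operation followed by a generic per-lane select, so only one builtin per pack instruction remains. From the caller's side nothing changes; a sketch of the two masking flavors (assumes -mavx512bw; the mask value is arbitrary):

#include <immintrin.h>

__m512i pack_demo(__m512i a, __m512i b, __m512i passthru) {
  __mmask32 m = 0xF0F0F0F0u;
  __m512i merged = _mm512_mask_packs_epi32(passthru, m, a, b); /* lanes with a 0 bit keep passthru */
  __m512i zeroed = _mm512_maskz_packs_epi32(m, a, b);          /* lanes with a 0 bit become 0 */
  return _mm512_add_epi16(merged, zeroed);
}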
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
index ae44b98a9495..4fd1add7735b 100644
--- a/lib/Headers/avx512dqintrin.h
+++ b/lib/Headers/avx512dqintrin.h
@@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x8 (__m256 __A)
+_mm512_broadcast_f32x8(__m256 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
- _mm512_undefined_ps(),
- (__mmask16) -1);
+ return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
- (__v16sf)__O,
- __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
+ (__v16sf)_mm512_broadcast_f32x8(__A),
+ (__v16sf)__O);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
- (__v16sf)_mm512_setzero_ps (),
- __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
+ (__v16sf)_mm512_broadcast_f32x8(__A),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x2 (__m128d __A)
+_mm512_broadcast_f64x2(__m128d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
- (__v8df)_mm512_undefined_pd(),
- (__mmask8) -1);
+ return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
- (__v8df)
- __O, __M);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x2(__A),
+ (__v8df)__O);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
- (__v8df)_mm512_setzero_ps (),
- __M);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x2(__A),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x8 (__m256i __A)
+_mm512_broadcast_i32x8(__m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
- (__v16si)_mm512_setzero_si512(),
- (__mmask16) -1);
+ return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
- (__v16si)__O,
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
+ (__v16si)_mm512_broadcast_i32x8(__A),
+ (__v16si)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
+ (__v16si)_mm512_broadcast_i32x8(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x2 (__m128i __A)
+_mm512_broadcast_i64x2(__m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
- (__v8di)_mm512_setzero_si512(),
- (__mmask8) -1);
+ return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
- (__v8di)
- __O, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x2(__A),
+ (__v8di)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
- (__v8di)_mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x2(__A),
+ (__v8di)_mm512_setzero_si512());
}
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
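
Expressing the broadcasts as __builtin_shufflevector gives the middle end an ordinary shuffle it can fold with neighboring shuffles, with masking layered on through the same select builtins as above. A usage sketch (assumes -mavx512dq):

#include <immintrin.h>

__m512d spread_f64x2(__m128d v) {
  /* Replicates the 128-bit [2 x double] block into all four 128-bit
     lanes: {v0, v1, v0, v1, v0, v1, v0, v1}. */
  return _mm512_broadcast_f64x2(v);
}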
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index e6a7217c8967..d8535f765889 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -4229,6 +4229,18 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_cvtsd_f64(__m512d __a)
+{
+ return __a[0];
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_cvtss_f32(__m512 __a)
+{
+ return __a[0];
+}
+
/* Unpack and Interleave */
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -4540,7 +4552,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
}
static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_loadu_pd(double const *__p)
+_mm512_loadu_pd(void const *__p)
{
struct __loadu_pd {
__m512d __v;
@@ -4549,7 +4561,7 @@ _mm512_loadu_pd(double const *__p)
}
static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_loadu_ps(float const *__p)
+_mm512_loadu_ps(void const *__p)
{
struct __loadu_ps {
__m512 __v;
@@ -4558,7 +4570,7 @@ _mm512_loadu_ps(float const *__p)
}
static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_load_ps(float const *__p)
+_mm512_load_ps(void const *__p)
{
return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
(__v16sf)
@@ -4584,7 +4596,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
}
static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_load_pd(double const *__p)
+_mm512_load_pd(void const *__p)
{
return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
(__v8df)
@@ -7278,107 +7290,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
(__mmask8)(U), (int)(R)); })
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x4 (__m128 __A)
+_mm512_broadcast_f32x4(__m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
- (__v16sf) __O,
- __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x4(__A),
+ (__v16sf)__O);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x4(__A),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x4 (__m256d __A)
+_mm512_broadcast_f64x4(__m256d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
- (__v8df) __O,
- __M);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x4(__A),
+ (__v8df)__O);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
{
- return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- __M);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x4(__A),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x4 (__m128i __A)
+_mm512_broadcast_i32x4(__m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
- (__v16si)
- _mm512_undefined_epi32 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
- (__v16si) __O,
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x4(__A),
+ (__v16si)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x4(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x4 (__m256i __A)
+_mm512_broadcast_i64x4(__m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
- (__v8di)
- _mm512_undefined_epi32 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
- (__v8di) __O,
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x4(__A),
+ (__v8di)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
{
- return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x4(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -7860,12 +7862,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
3 + ((imm) & 0x3) * 4); })
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
- (__v4si)__W); })
+ (__v4si)(W)); })
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
(__v4si)_mm_setzero_si128()); })
@@ -7878,12 +7880,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
((imm) & 1) ? 7 : 3); })
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
- (__v4di)__W); })
+ (__v4di)(W)); })
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
(__v4di)_mm256_setzero_si256()); })
@@ -8159,11 +8161,11 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
(__v8di)(__m512i)(index), (__mmask8)-1, \
(int)(scale)); })
-#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
- __addr, __scale) __extension__({\
-__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
- __addr,(__v8di) __index, __mask, __scale);\
-})
+#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
+ (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
+ (float const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)(mask), (int)(scale)); })
#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
(__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
@@ -8858,6 +8860,8 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
(__mmask16) -1);
}
+#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
+
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
__m512i __Y)
@@ -8868,6 +8872,8 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
__M);
}
+#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
+
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
@@ -8925,7 +8931,7 @@ _mm512_stream_si512 (__m512i * __P, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_stream_load_si512 (void *__P)
{
- return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+ return (__m512i) __builtin_nontemporal_load((const __v8di *)__P);
}
static __inline__ void __DEFAULT_FN_ATTRS
@@ -9635,6 +9641,45 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
}
#endif
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
+ char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
+ char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
+ char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
+ char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
+ char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
+ char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
+ char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
+ char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
+ char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
+ char __e4, char __e3, char __e2, char __e1, char __e0) {
+
+ return __extension__ (__m512i)(__v64qi)
+ {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+ __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+ __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+ __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
+ __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
+ __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
+ __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
+ __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
+ short __e27, short __e26, short __e25, short __e24, short __e23,
+ short __e22, short __e21, short __e20, short __e19, short __e18,
+ short __e17, short __e16, short __e15, short __e14, short __e13,
+ short __e12, short __e11, short __e10, short __e9, short __e8,
+ short __e7, short __e6, short __e5, short __e4, short __e3,
+ short __e2, short __e1, short __e0) {
+ return __extension__ (__m512i)(__v32hi)
+ {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+ __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+ __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+ __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
+}
+
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
int __E, int __F, int __G, int __H,
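
Like the existing _mm512_set_epi32/_mm512_set_epi64, the new initializers take their arguments from the highest element down to element 0. A sketch (assumes -mavx512f):

#include <immintrin.h>

__m512i descending_words(void) {
  /* Element 0 receives the last argument, so memory order is 0..31. */
  return _mm512_set_epi16(31, 30, 29, 28, 27, 26, 25, 24,
                          23, 22, 21, 20, 19, 18, 17, 16,
                          15, 14, 13, 12, 11, 10,  9,  8,
                           7,  6,  5,  4,  3,  2,  1,  0);
}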
diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h
index cd9da4370564..aecd7df34d05 100644
--- a/lib/Headers/avx512vldqintrin.h
+++ b/lib/Headers/avx512vldqintrin.h
@@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_f64x2 (__m128d __A)
+_mm256_broadcast_f64x2(__m128d __A)
{
- return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
- (__v4df)_mm256_undefined_pd(),
- (__mmask8) -1);
+ return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+ 0, 1, 0, 1);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
{
- return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
- (__v4df) __O,
- __M);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+ (__v4df)_mm256_broadcast_f64x2(__A),
+ (__v4df)__O);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
{
- return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
- (__v4df) _mm256_setzero_ps (),
- __M);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+ (__v4df)_mm256_broadcast_f64x2(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i64x2 (__m128i __A)
+_mm256_broadcast_i64x2(__m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
- (__v4di)_mm256_undefined_si256(),
- (__mmask8) -1);
+ return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+ 0, 1, 0, 1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
- (__v4di) __O,
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_broadcast_i64x2(__A),
+ (__v4di)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
- (__v4di) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_broadcast_i64x2(__A),
+ (__v4di)_mm256_setzero_si256());
}
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h
index f3744da6ab8a..99bb050de4d7 100644
--- a/lib/Headers/avx512vlintrin.h
+++ b/lib/Headers/avx512vlintrin.h
@@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_f32x4 (__m128 __A)
+_mm256_broadcast_f32x4(__m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
- (__v8sf)_mm256_undefined_pd (),
- (__mmask8) -1);
+ return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
- (__v8sf) __O,
- __M);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x4(__A),
+ (__v8sf)__O);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
- (__v8sf) _mm256_setzero_ps (),
- __M);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x4(__A),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i32x4 (__m128i __A)
+_mm256_broadcast_i32x4(__m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
- (__v8si)_mm256_undefined_si256 (),
- (__mmask8) -1);
+ return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
- (__v8si)
- __O, __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x4(__A),
+ (__v8si)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
- __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x4(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index be03ba346031..5381878a5da3 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -1613,9 +1613,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
-#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
+#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
-#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
+#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
@@ -1628,10 +1628,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
-#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
+#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
-#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
+#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
@@ -1660,17 +1660,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
@@ -1697,17 +1718,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
@@ -1734,17 +1776,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
@@ -1771,17 +1834,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
@@ -1807,17 +1891,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
@@ -1843,17 +1948,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
-/// 00h, 08h, 10h, 18h: Equal \n
-/// 01h, 09h, 11h, 19h: Less than \n
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-/// (swapped operands) \n
-/// 03h, 0Bh, 13h, 1Bh: Unordered \n
-/// 04h, 0Ch, 14h, 1Ch: Not equal \n
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-/// (swapped operands) \n
-/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands) \n
-/// 07h, 0Fh, 17h, 1Fh: Ordered
+/// 0x00 : Equal (ordered, non-signaling)
+/// 0x01 : Less-than (ordered, signaling)
+/// 0x02 : Less-than-or-equal (ordered, signaling)
+/// 0x03 : Unordered (non-signaling)
+/// 0x04 : Not-equal (unordered, non-signaling)
+/// 0x05 : Not-less-than (unordered, signaling)
+/// 0x06 : Not-less-than-or-equal (unordered, signaling)
+/// 0x07 : Ordered (non-signaling)
+/// 0x08 : Equal (unordered, non-signaling)
+/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
+/// 0x0a : Not-greater-than (unordered, signaling)
+/// 0x0b : False (ordered, non-signaling)
+/// 0x0c : Not-equal (ordered, non-signaling)
+/// 0x0d : Greater-than-or-equal (ordered, signaling)
+/// 0x0e : Greater-than (ordered, signaling)
+/// 0x0f : True (unordered, non-signaling)
+/// 0x10 : Equal (ordered, signaling)
+/// 0x11 : Less-than (ordered, non-signaling)
+/// 0x12 : Less-than-or-equal (ordered, non-signaling)
+/// 0x13 : Unordered (signaling)
+/// 0x14 : Not-equal (unordered, signaling)
+/// 0x15 : Not-less-than (unordered, non-signaling)
+/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
+/// 0x17 : Ordered (signaling)
+/// 0x18 : Equal (unordered, signaling)
+/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+/// 0x1a : Not-greater-than (unordered, non-signaling)
+/// 0x1b : False (ordered, signaling)
+/// 0x1c : Not-equal (ordered, signaling)
+/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
+/// 0x1e : Greater-than (ordered, non-signaling)
+/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
@@ -2184,12 +2310,32 @@ _mm256_cvttps_epi32(__m256 __a)
return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
}
+/// \brief Returns the first element of the input vector of [4 x double].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 64 bit double containing the first element of the input vector.
static __inline double __DEFAULT_FN_ATTRS
_mm256_cvtsd_f64(__m256d __a)
{
return __a[0];
}
+/// \brief Returns the first element of the input vector of [8 x i32].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x i32].
+/// \returns A 32 bit integer containing the first element of the input vector.
static __inline int __DEFAULT_FN_ATTRS
_mm256_cvtsi256_si32(__m256i __a)
{
@@ -2197,6 +2343,16 @@ _mm256_cvtsi256_si32(__m256i __a)
return __b[0];
}
+/// \brief Returns the first element of the input vector of [8 x float].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 32 bit float containing the first element of the input vector.
static __inline float __DEFAULT_FN_ATTRS
_mm256_cvtss_f32(__m256 __a)
{
diff --git a/lib/Headers/clzerointrin.h b/lib/Headers/clzerointrin.h
new file mode 100644
index 000000000000..ed7478ff87ea
--- /dev/null
+++ b/lib/Headers/clzerointrin.h
@@ -0,0 +1,50 @@
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86INTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _CLZEROINTRIN_H
+#define _CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("clzero")))
+
+/// \brief Zeroes out the cache line that contains the specified address.
+///
+/// \headerfile <clzerointrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+/// A pointer into the cache line that is to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+ __builtin_ia32_clzero ((void *)__line);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _CLZEROINTRIN_H */
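
A minimal sketch of the new intrinsic (assumes -mclzero and the 64-byte cache lines of the AMD Zen parts that introduced CLZERO; aligned_alloc is C11):

#include <x86intrin.h>
#include <stdlib.h>

void wipe_one_line(void) {
  void *line = aligned_alloc(64, 64);  /* CLZERO expects a line-aligned address */
  if (line) {
    _mm_clzero(line);  /* zeroes the entire cache line containing the address */
    free(line);
  }
}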
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 1512f9f0b47b..0dfa6a9fbc1f 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -1599,6 +1599,17 @@ _mm_loadu_pd(double const *__dp)
return ((struct __loadu_pd*)__dp)->__v;
}
+/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
+/// vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A pointer to a 64-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const *__a)
{
@@ -1609,6 +1620,17 @@ _mm_loadu_si64(void const *__a)
return (__m128i){__u, 0L};
}
+/// \brief Loads a 64-bit double-precision value to the low element of a
+/// 128-bit vector of [2 x double] and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location containing a double-precision value.
+/// The address of the memory location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded value.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp)
{
@@ -1787,7 +1809,7 @@ _mm_setzero_pd(void)
/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
-//
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
@@ -2369,7 +2391,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
/// \brief Computes the absolute differences of corresponding 8-bit integer
/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
-/// separately sums the second 8 absolute differences. Packss these two
+/// separately sums the second 8 absolute differences. Packs these two
/// unsigned 16-bit integer sums into the upper and lower elements of a
/// [2 x i64] vector.
///
@@ -4019,7 +4041,7 @@ extern "C" {
/// \param __p
/// A pointer to the memory location used to identify the cache line to be
/// flushed.
-void _mm_clflush(void const *);
+void _mm_clflush(void const * __p);
/// \brief Forces strong memory ordering (serialization) between load
/// instructions preceding this instruction and load instructions following
@@ -4141,7 +4163,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \param __a
/// A 128-bit integer vector.
/// \param __imm
-/// An immediate value. Bits [3:0] selects values from \a __a to be assigned
+/// An immediate value. Bits [2:0] selects values from \a __a to be assigned
/// to bits[15:0] of the result. \n
/// 000: assign values from bits [15:0] of \a __a. \n
/// 001: assign values from bits [31:16] of \a __a. \n
@@ -4788,4 +4810,12 @@ void _mm_pause(void);
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+#define _MM_DENORMALS_ZERO_ON (0x0040)
+#define _MM_DENORMALS_ZERO_OFF (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
#endif /* __EMMINTRIN_H */
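
The new macros mirror the long-standing _MM_*_FLUSH_ZERO_MODE helpers from xmmintrin.h, but toggle the MXCSR denormals-are-zero (DAZ) bit. A sketch (assumes SSE2 hardware that implements DAZ):

#include <emmintrin.h>

void enable_daz(void) {
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  /* treat denormal inputs as zero */
}

int daz_is_on(void) {
  return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
}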
diff --git a/lib/Headers/f16cintrin.h b/lib/Headers/f16cintrin.h
index 180712ffc680..b796cc84316f 100644
--- a/lib/Headers/f16cintrin.h
+++ b/lib/Headers/f16cintrin.h
@@ -72,9 +72,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) \
- ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
- (imm)))[0]))
+#define _cvtss_sh(a, imm) __extension__ ({ \
+ (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+ (imm)))[0]); })
/// \brief Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
@@ -99,8 +99,8 @@ _cvtsh_ss(unsigned short __a)
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
-#define _mm_cvtps_ph(a, imm) \
- ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
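
Editor's note: the two macros above are rewritten as GNU statement expressions; the conversion semantics are unchanged. A hedged round-trip sketch, assuming a target built with -mf16c:

  #include <x86intrin.h>

  /* Round-trip a float through half precision; imm 0 selects
     round-to-nearest per the rounding table documented above. */
  float roundtrip_half(float f) {
    unsigned short h = _cvtss_sh(f, 0);
    return _cvtsh_ss(h);
  }
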
diff --git a/lib/Headers/htmxlintrin.h b/lib/Headers/htmxlintrin.h
index 16dc7056c6b0..28f7d025bb30 100644
--- a/lib/Headers/htmxlintrin.h
+++ b/lib/Headers/htmxlintrin.h
@@ -35,14 +35,10 @@
extern "C" {
#endif
-#define _TEXASR_PTR(TM_BUF) \
- ((texasr_t *)((TM_BUF)+0))
-#define _TEXASRU_PTR(TM_BUF) \
- ((texasru_t *)((TM_BUF)+0))
-#define _TEXASRL_PTR(TM_BUF) \
- ((texasrl_t *)((TM_BUF)+4))
-#define _TFIAR_PTR(TM_BUF) \
- ((tfiar_t *)((TM_BUF)+8))
+#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
+#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
typedef char TM_buff_type[16];
@@ -178,7 +174,7 @@ extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_conflict(void* const __TM_buff)
{
- texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+ texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
/* Return TEXASR bits 11 (Self-Induced Conflict) through
14 (Translation Invalidation Conflict). */
return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
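
Editor's note: the rewritten _TEXASR*/_TFIAR macros cast TM_BUF through char * before adding the offset, because arithmetic on a void * operand is a GNU extension rather than portable C. A minimal sketch of the same byte-offset technique, with a hypothetical helper name:

  #include <stdint.h>

  /* Hypothetical helper: read the 32-bit word 4 bytes into a buffer.
     The (char *) cast makes "+ 4" a well-defined 4-byte offset, exactly
     as the fixed macros above do. */
  static uint32_t word_at_offset_4(void *buf) {
    return *(uint32_t *)((char *)buf + 4);
  }
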
diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h
index a35262af846a..38d9407abed9 100644
--- a/lib/Headers/intrin.h
+++ b/lib/Headers/intrin.h
@@ -69,7 +69,6 @@ static __inline__
__int64 __emul(int, int);
static __inline__
unsigned __int64 __emulu(unsigned int, unsigned int);
-void __cdecl __fastfail(unsigned int);
unsigned int __getcallerseflags(void);
static __inline__
void __halt(void);
@@ -80,7 +79,6 @@ void __incfsdword(unsigned long);
void __incfsword(unsigned long);
unsigned long __indword(unsigned short);
void __indwordstring(unsigned short, unsigned long *, unsigned long);
-void __int2c(void);
void __invlpg(void *);
unsigned short __inword(unsigned short);
void __inwordstring(unsigned short, unsigned short *, unsigned long);
@@ -142,7 +140,6 @@ void __svm_stgi(void);
void __svm_vmload(size_t);
void __svm_vmrun(size_t);
void __svm_vmsave(size_t);
-void __ud2(void);
unsigned __int64 __ull_rshift(unsigned __int64, int);
void __vmx_off(void);
void __vmx_vmptrst(unsigned __int64 *);
@@ -176,7 +173,6 @@ void __cdecl _disable(void);
void __cdecl _enable(void);
long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
unsigned char _interlockedbittestandreset(long volatile *, long);
-static __inline__
unsigned char _interlockedbittestandset(long volatile *, long);
long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
@@ -372,11 +368,6 @@ _bittestandset(long *_BitBase, long _BitPos) {
*_BitBase = *_BitBase | (1 << _BitPos);
return _Res;
}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
- long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
- return (_PrevVal >> _BitPos) & 1;
-}
#if defined(__arm__) || defined(__aarch64__)
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
@@ -872,48 +863,7 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
return _Comparand;
}
#endif
-/*----------------------------------------------------------------------------*\
-|* readfs, readgs
-|* (Pointers in address space #256 and #257 are relative to the GS and FS
-|* segment registers, respectively.)
-\*----------------------------------------------------------------------------*/
-#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset) \
- ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
- (__offset))
-#ifdef __i386__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readfsbyte(unsigned long __offset) {
- return *__ptr_to_addr_space(257, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readfsword(unsigned long __offset) {
- return *__ptr_to_addr_space(257, unsigned short, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readfsqword(unsigned long __offset) {
- return *__ptr_to_addr_space(257, unsigned __int64, __offset);
-}
-#endif
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readgsbyte(unsigned long __offset) {
- return *__ptr_to_addr_space(256, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readgsword(unsigned long __offset) {
- return *__ptr_to_addr_space(256, unsigned short, __offset);
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-__readgsdword(unsigned long __offset) {
- return *__ptr_to_addr_space(256, unsigned long, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readgsqword(unsigned long __offset) {
- return *__ptr_to_addr_space(256, unsigned __int64, __offset);
-}
-#endif
-#undef __ptr_to_addr_space
/*----------------------------------------------------------------------------*\
|* movs, stos
\*----------------------------------------------------------------------------*/
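
Editor's note: the deleted helpers relied on clang's x86 convention, stated in the removed comment block, that address spaces 256 and 257 denote GS- and FS-relative memory; this hunk removes them from intrin.h. A sketch of the underlying technique, under a hypothetical name:

  /* Hypothetical re-creation of the removed technique: on x86-64 clang,
     dereferencing a pointer in address space 256 emits a GS-relative load. */
  #ifdef __x86_64__
  static inline unsigned long long my_readgsqword(unsigned long offset) {
    return *(volatile unsigned long long
                 __attribute__((__address_space__(256))) *)offset;
  }
  #endif
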
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
index e0c277a65a33..2b3618398cbf 100644
--- a/lib/Headers/mmintrin.h
+++ b/lib/Headers/mmintrin.h
@@ -211,7 +211,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [8 x i8]. \n
+/// A 64-bit integer vector of [8 x i8]. \n
/// Bits [39:32] are written to bits [7:0] of the result. \n
/// Bits [47:40] are written to bits [23:16] of the result. \n
/// Bits [55:48] are written to bits [39:32] of the result. \n
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 11ef2f902945..95d26cefa6f7 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -61,6 +61,7 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "xopintrin.h"
textual header "fma4intrin.h"
textual header "mwaitxintrin.h"
+ textual header "clzerointrin.h"
explicit module mm_malloc {
requires !freestanding
diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h
index 0c25d312709d..6452d5c987f0 100644
--- a/lib/Headers/opencl-c.h
+++ b/lib/Headers/opencl-c.h
@@ -16,6 +16,12 @@
#endif //cl_khr_depth_images
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+#ifdef cl_khr_3d_image_writes
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#endif //cl_khr_3d_image_writes
+#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+
#define __ovld __attribute__((overloadable))
#define __conv __attribute__((convergent))
@@ -6578,777 +6584,85 @@ half16 __ovld __cnfn convert_half16_rtz(double16);
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
* Reinterprets a data type as another data type of the same size
*/
-char __ovld __cnfn as_char(char);
-char __ovld __cnfn as_char(uchar);
-
-char2 __ovld __cnfn as_char2(char2);
-char2 __ovld __cnfn as_char2(uchar2);
-char2 __ovld __cnfn as_char2(short);
-char2 __ovld __cnfn as_char2(ushort);
-
-char3 __ovld __cnfn as_char3(char3);
-char3 __ovld __cnfn as_char3(char4);
-char3 __ovld __cnfn as_char3(uchar3);
-char3 __ovld __cnfn as_char3(uchar4);
-char3 __ovld __cnfn as_char3(short2);
-char3 __ovld __cnfn as_char3(ushort2);
-char3 __ovld __cnfn as_char3(int);
-char3 __ovld __cnfn as_char3(uint);
-char3 __ovld __cnfn as_char3(float);
-
-char4 __ovld __cnfn as_char4(char3);
-char4 __ovld __cnfn as_char4(char4);
-char4 __ovld __cnfn as_char4(uchar3);
-char4 __ovld __cnfn as_char4(uchar4);
-char4 __ovld __cnfn as_char4(short2);
-char4 __ovld __cnfn as_char4(ushort2);
-char4 __ovld __cnfn as_char4(int);
-char4 __ovld __cnfn as_char4(uint);
-char4 __ovld __cnfn as_char4(float);
-
-char8 __ovld __cnfn as_char8(char8);
-char8 __ovld __cnfn as_char8(uchar8);
-char8 __ovld __cnfn as_char8(short3);
-char8 __ovld __cnfn as_char8(short4);
-char8 __ovld __cnfn as_char8(ushort3);
-char8 __ovld __cnfn as_char8(ushort4);
-char8 __ovld __cnfn as_char8(int2);
-char8 __ovld __cnfn as_char8(uint2);
-char8 __ovld __cnfn as_char8(long);
-char8 __ovld __cnfn as_char8(ulong);
-char8 __ovld __cnfn as_char8(float2);
-
-char16 __ovld __cnfn as_char16(char16);
-char16 __ovld __cnfn as_char16(uchar16);
-char16 __ovld __cnfn as_char16(short8);
-char16 __ovld __cnfn as_char16(ushort8);
-char16 __ovld __cnfn as_char16(int3);
-char16 __ovld __cnfn as_char16(int4);
-char16 __ovld __cnfn as_char16(uint3);
-char16 __ovld __cnfn as_char16(uint4);
-char16 __ovld __cnfn as_char16(long2);
-char16 __ovld __cnfn as_char16(ulong2);
-char16 __ovld __cnfn as_char16(float3);
-char16 __ovld __cnfn as_char16(float4);
-
-uchar __ovld __cnfn as_uchar(char);
-uchar __ovld __cnfn as_uchar(uchar);
-
-uchar2 __ovld __cnfn as_uchar2(char2);
-uchar2 __ovld __cnfn as_uchar2(uchar2);
-uchar2 __ovld __cnfn as_uchar2(short);
-uchar2 __ovld __cnfn as_uchar2(ushort);
-
-uchar3 __ovld __cnfn as_uchar3(char3);
-uchar3 __ovld __cnfn as_uchar3(char4);
-uchar3 __ovld __cnfn as_uchar3(uchar3);
-uchar3 __ovld __cnfn as_uchar3(uchar4);
-uchar3 __ovld __cnfn as_uchar3(short2);
-uchar3 __ovld __cnfn as_uchar3(ushort2);
-uchar3 __ovld __cnfn as_uchar3(int);
-uchar3 __ovld __cnfn as_uchar3(uint);
-uchar3 __ovld __cnfn as_uchar3(float);
-
-uchar4 __ovld __cnfn as_uchar4(char3);
-uchar4 __ovld __cnfn as_uchar4(char4);
-uchar4 __ovld __cnfn as_uchar4(uchar3);
-uchar4 __ovld __cnfn as_uchar4(uchar4);
-uchar4 __ovld __cnfn as_uchar4(short2);
-uchar4 __ovld __cnfn as_uchar4(ushort2);
-uchar4 __ovld __cnfn as_uchar4(int);
-uchar4 __ovld __cnfn as_uchar4(uint);
-uchar4 __ovld __cnfn as_uchar4(float);
-
-uchar8 __ovld __cnfn as_uchar8(char8);
-uchar8 __ovld __cnfn as_uchar8(uchar8);
-uchar8 __ovld __cnfn as_uchar8(short3);
-uchar8 __ovld __cnfn as_uchar8(short4);
-uchar8 __ovld __cnfn as_uchar8(ushort3);
-uchar8 __ovld __cnfn as_uchar8(ushort4);
-uchar8 __ovld __cnfn as_uchar8(int2);
-uchar8 __ovld __cnfn as_uchar8(uint2);
-uchar8 __ovld __cnfn as_uchar8(long);
-uchar8 __ovld __cnfn as_uchar8(ulong);
-uchar8 __ovld __cnfn as_uchar8(float2);
-
-uchar16 __ovld __cnfn as_uchar16(char16);
-uchar16 __ovld __cnfn as_uchar16(uchar16);
-uchar16 __ovld __cnfn as_uchar16(short8);
-uchar16 __ovld __cnfn as_uchar16(ushort8);
-uchar16 __ovld __cnfn as_uchar16(int3);
-uchar16 __ovld __cnfn as_uchar16(int4);
-uchar16 __ovld __cnfn as_uchar16(uint3);
-uchar16 __ovld __cnfn as_uchar16(uint4);
-uchar16 __ovld __cnfn as_uchar16(long2);
-uchar16 __ovld __cnfn as_uchar16(ulong2);
-uchar16 __ovld __cnfn as_uchar16(float3);
-uchar16 __ovld __cnfn as_uchar16(float4);
-
-short __ovld __cnfn as_short(char2);
-short __ovld __cnfn as_short(uchar2);
-short __ovld __cnfn as_short(short);
-short __ovld __cnfn as_short(ushort);
-
-short2 __ovld __cnfn as_short2(char3);
-short2 __ovld __cnfn as_short2(char4);
-short2 __ovld __cnfn as_short2(uchar3);
-short2 __ovld __cnfn as_short2(uchar4);
-short2 __ovld __cnfn as_short2(short2);
-short2 __ovld __cnfn as_short2(ushort2);
-short2 __ovld __cnfn as_short2(int);
-short2 __ovld __cnfn as_short2(uint);
-short2 __ovld __cnfn as_short2(float);
-
-short3 __ovld __cnfn as_short3(char8);
-short3 __ovld __cnfn as_short3(uchar8);
-short3 __ovld __cnfn as_short3(short3);
-short3 __ovld __cnfn as_short3(short4);
-short3 __ovld __cnfn as_short3(ushort3);
-short3 __ovld __cnfn as_short3(ushort4);
-short3 __ovld __cnfn as_short3(int2);
-short3 __ovld __cnfn as_short3(uint2);
-short3 __ovld __cnfn as_short3(long);
-short3 __ovld __cnfn as_short3(ulong);
-short3 __ovld __cnfn as_short3(float2);
-
-short4 __ovld __cnfn as_short4(char8);
-short4 __ovld __cnfn as_short4(uchar8);
-short4 __ovld __cnfn as_short4(short3);
-short4 __ovld __cnfn as_short4(short4);
-short4 __ovld __cnfn as_short4(ushort3);
-short4 __ovld __cnfn as_short4(ushort4);
-short4 __ovld __cnfn as_short4(int2);
-short4 __ovld __cnfn as_short4(uint2);
-short4 __ovld __cnfn as_short4(long);
-short4 __ovld __cnfn as_short4(ulong);
-short4 __ovld __cnfn as_short4(float2);
-
-short8 __ovld __cnfn as_short8(char16);
-short8 __ovld __cnfn as_short8(uchar16);
-short8 __ovld __cnfn as_short8(short8);
-short8 __ovld __cnfn as_short8(ushort8);
-short8 __ovld __cnfn as_short8(int3);
-short8 __ovld __cnfn as_short8(int4);
-short8 __ovld __cnfn as_short8(uint3);
-short8 __ovld __cnfn as_short8(uint4);
-short8 __ovld __cnfn as_short8(long2);
-short8 __ovld __cnfn as_short8(ulong2);
-short8 __ovld __cnfn as_short8(float3);
-short8 __ovld __cnfn as_short8(float4);
-
-short16 __ovld __cnfn as_short16(short16);
-short16 __ovld __cnfn as_short16(ushort16);
-short16 __ovld __cnfn as_short16(int8);
-short16 __ovld __cnfn as_short16(uint8);
-short16 __ovld __cnfn as_short16(long3);
-short16 __ovld __cnfn as_short16(long4);
-short16 __ovld __cnfn as_short16(ulong3);
-short16 __ovld __cnfn as_short16(ulong4);
-short16 __ovld __cnfn as_short16(float8);
-
-ushort __ovld __cnfn as_ushort(char2);
-ushort __ovld __cnfn as_ushort(uchar2);
-ushort __ovld __cnfn as_ushort(short);
-ushort __ovld __cnfn as_ushort(ushort);
-
-ushort2 __ovld __cnfn as_ushort2(char3);
-ushort2 __ovld __cnfn as_ushort2(char4);
-ushort2 __ovld __cnfn as_ushort2(uchar3);
-ushort2 __ovld __cnfn as_ushort2(uchar4);
-ushort2 __ovld __cnfn as_ushort2(short2);
-ushort2 __ovld __cnfn as_ushort2(ushort2);
-ushort2 __ovld __cnfn as_ushort2(int);
-ushort2 __ovld __cnfn as_ushort2(uint);
-ushort2 __ovld __cnfn as_ushort2(float);
-
-ushort3 __ovld __cnfn as_ushort3(char8);
-ushort3 __ovld __cnfn as_ushort3(uchar8);
-ushort3 __ovld __cnfn as_ushort3(short3);
-ushort3 __ovld __cnfn as_ushort3(short4);
-ushort3 __ovld __cnfn as_ushort3(ushort3);
-ushort3 __ovld __cnfn as_ushort3(ushort4);
-ushort3 __ovld __cnfn as_ushort3(int2);
-ushort3 __ovld __cnfn as_ushort3(uint2);
-ushort3 __ovld __cnfn as_ushort3(long);
-ushort3 __ovld __cnfn as_ushort3(ulong);
-ushort3 __ovld __cnfn as_ushort3(float2);
-
-ushort4 __ovld __cnfn as_ushort4(char8);
-ushort4 __ovld __cnfn as_ushort4(uchar8);
-ushort4 __ovld __cnfn as_ushort4(short3);
-ushort4 __ovld __cnfn as_ushort4(short4);
-ushort4 __ovld __cnfn as_ushort4(ushort3);
-ushort4 __ovld __cnfn as_ushort4(ushort4);
-ushort4 __ovld __cnfn as_ushort4(int2);
-ushort4 __ovld __cnfn as_ushort4(uint2);
-ushort4 __ovld __cnfn as_ushort4(long);
-ushort4 __ovld __cnfn as_ushort4(ulong);
-ushort4 __ovld __cnfn as_ushort4(float2);
-
-ushort8 __ovld __cnfn as_ushort8(char16);
-ushort8 __ovld __cnfn as_ushort8(uchar16);
-ushort8 __ovld __cnfn as_ushort8(short8);
-ushort8 __ovld __cnfn as_ushort8(ushort8);
-ushort8 __ovld __cnfn as_ushort8(int3);
-ushort8 __ovld __cnfn as_ushort8(int4);
-ushort8 __ovld __cnfn as_ushort8(uint3);
-ushort8 __ovld __cnfn as_ushort8(uint4);
-ushort8 __ovld __cnfn as_ushort8(long2);
-ushort8 __ovld __cnfn as_ushort8(ulong2);
-ushort8 __ovld __cnfn as_ushort8(float3);
-ushort8 __ovld __cnfn as_ushort8(float4);
-
-ushort16 __ovld __cnfn as_ushort16(short16);
-ushort16 __ovld __cnfn as_ushort16(ushort16);
-ushort16 __ovld __cnfn as_ushort16(int8);
-ushort16 __ovld __cnfn as_ushort16(uint8);
-ushort16 __ovld __cnfn as_ushort16(long3);
-ushort16 __ovld __cnfn as_ushort16(long4);
-ushort16 __ovld __cnfn as_ushort16(ulong3);
-ushort16 __ovld __cnfn as_ushort16(ulong4);
-ushort16 __ovld __cnfn as_ushort16(float8);
-
-int __ovld __cnfn as_int(char3);
-int __ovld __cnfn as_int(char4);
-int __ovld __cnfn as_int(uchar3);
-int __ovld __cnfn as_int(uchar4);
-int __ovld __cnfn as_int(short2);
-int __ovld __cnfn as_int(ushort2);
-int __ovld __cnfn as_int(int);
-int __ovld __cnfn as_int(uint);
-int __ovld __cnfn as_int(float);
-
-int2 __ovld __cnfn as_int2(char8);
-int2 __ovld __cnfn as_int2(uchar8);
-int2 __ovld __cnfn as_int2(short3);
-int2 __ovld __cnfn as_int2(short4);
-int2 __ovld __cnfn as_int2(ushort3);
-int2 __ovld __cnfn as_int2(ushort4);
-int2 __ovld __cnfn as_int2(int2);
-int2 __ovld __cnfn as_int2(uint2);
-int2 __ovld __cnfn as_int2(long);
-int2 __ovld __cnfn as_int2(ulong);
-int2 __ovld __cnfn as_int2(float2);
-
-int3 __ovld __cnfn as_int3(char16);
-int3 __ovld __cnfn as_int3(uchar16);
-int3 __ovld __cnfn as_int3(short8);
-int3 __ovld __cnfn as_int3(ushort8);
-int3 __ovld __cnfn as_int3(int3);
-int3 __ovld __cnfn as_int3(int4);
-int3 __ovld __cnfn as_int3(uint3);
-int3 __ovld __cnfn as_int3(uint4);
-int3 __ovld __cnfn as_int3(long2);
-int3 __ovld __cnfn as_int3(ulong2);
-int3 __ovld __cnfn as_int3(float3);
-int3 __ovld __cnfn as_int3(float4);
-
-int4 __ovld __cnfn as_int4(char16);
-int4 __ovld __cnfn as_int4(uchar16);
-int4 __ovld __cnfn as_int4(short8);
-int4 __ovld __cnfn as_int4(ushort8);
-int4 __ovld __cnfn as_int4(int3);
-int4 __ovld __cnfn as_int4(int4);
-int4 __ovld __cnfn as_int4(uint3);
-int4 __ovld __cnfn as_int4(uint4);
-int4 __ovld __cnfn as_int4(long2);
-int4 __ovld __cnfn as_int4(ulong2);
-int4 __ovld __cnfn as_int4(float3);
-int4 __ovld __cnfn as_int4(float4);
-
-int8 __ovld __cnfn as_int8(short16);
-int8 __ovld __cnfn as_int8(ushort16);
-int8 __ovld __cnfn as_int8(int8);
-int8 __ovld __cnfn as_int8(uint8);
-int8 __ovld __cnfn as_int8(long3);
-int8 __ovld __cnfn as_int8(long4);
-int8 __ovld __cnfn as_int8(ulong3);
-int8 __ovld __cnfn as_int8(ulong4);
-int8 __ovld __cnfn as_int8(float8);
-
-int16 __ovld __cnfn as_int16(int16);
-int16 __ovld __cnfn as_int16(uint16);
-int16 __ovld __cnfn as_int16(long8);
-int16 __ovld __cnfn as_int16(ulong8);
-int16 __ovld __cnfn as_int16(float16);
-
-uint __ovld __cnfn as_uint(char3);
-uint __ovld __cnfn as_uint(char4);
-uint __ovld __cnfn as_uint(uchar3);
-uint __ovld __cnfn as_uint(uchar4);
-uint __ovld __cnfn as_uint(short2);
-uint __ovld __cnfn as_uint(ushort2);
-uint __ovld __cnfn as_uint(int);
-uint __ovld __cnfn as_uint(uint);
-uint __ovld __cnfn as_uint(float);
-
-uint2 __ovld __cnfn as_uint2(char8);
-uint2 __ovld __cnfn as_uint2(uchar8);
-uint2 __ovld __cnfn as_uint2(short3);
-uint2 __ovld __cnfn as_uint2(short4);
-uint2 __ovld __cnfn as_uint2(ushort3);
-uint2 __ovld __cnfn as_uint2(ushort4);
-uint2 __ovld __cnfn as_uint2(int2);
-uint2 __ovld __cnfn as_uint2(uint2);
-uint2 __ovld __cnfn as_uint2(long);
-uint2 __ovld __cnfn as_uint2(ulong);
-uint2 __ovld __cnfn as_uint2(float2);
-
-uint3 __ovld __cnfn as_uint3(char16);
-uint3 __ovld __cnfn as_uint3(uchar16);
-uint3 __ovld __cnfn as_uint3(short8);
-uint3 __ovld __cnfn as_uint3(ushort8);
-uint3 __ovld __cnfn as_uint3(int3);
-uint3 __ovld __cnfn as_uint3(int4);
-uint3 __ovld __cnfn as_uint3(uint3);
-uint3 __ovld __cnfn as_uint3(uint4);
-uint3 __ovld __cnfn as_uint3(long2);
-uint3 __ovld __cnfn as_uint3(ulong2);
-uint3 __ovld __cnfn as_uint3(float3);
-uint3 __ovld __cnfn as_uint3(float4);
-
-uint4 __ovld __cnfn as_uint4(char16);
-uint4 __ovld __cnfn as_uint4(uchar16);
-uint4 __ovld __cnfn as_uint4(short8);
-uint4 __ovld __cnfn as_uint4(ushort8);
-uint4 __ovld __cnfn as_uint4(int3);
-uint4 __ovld __cnfn as_uint4(int4);
-uint4 __ovld __cnfn as_uint4(uint3);
-uint4 __ovld __cnfn as_uint4(uint4);
-uint4 __ovld __cnfn as_uint4(long2);
-uint4 __ovld __cnfn as_uint4(ulong2);
-uint4 __ovld __cnfn as_uint4(float3);
-uint4 __ovld __cnfn as_uint4(float4);
-
-uint8 __ovld __cnfn as_uint8(short16);
-uint8 __ovld __cnfn as_uint8(ushort16);
-uint8 __ovld __cnfn as_uint8(int8);
-uint8 __ovld __cnfn as_uint8(uint8);
-uint8 __ovld __cnfn as_uint8(long3);
-uint8 __ovld __cnfn as_uint8(long4);
-uint8 __ovld __cnfn as_uint8(ulong3);
-uint8 __ovld __cnfn as_uint8(ulong4);
-uint8 __ovld __cnfn as_uint8(float8);
-
-uint16 __ovld __cnfn as_uint16(int16);
-uint16 __ovld __cnfn as_uint16(uint16);
-uint16 __ovld __cnfn as_uint16(long8);
-uint16 __ovld __cnfn as_uint16(ulong8);
-uint16 __ovld __cnfn as_uint16(float16);
-
-long __ovld __cnfn as_long(char8);
-long __ovld __cnfn as_long(uchar8);
-long __ovld __cnfn as_long(short3);
-long __ovld __cnfn as_long(short4);
-long __ovld __cnfn as_long(ushort3);
-long __ovld __cnfn as_long(ushort4);
-long __ovld __cnfn as_long(int2);
-long __ovld __cnfn as_long(uint2);
-long __ovld __cnfn as_long(long);
-long __ovld __cnfn as_long(ulong);
-long __ovld __cnfn as_long(float2);
-
-long2 __ovld __cnfn as_long2(char16);
-long2 __ovld __cnfn as_long2(uchar16);
-long2 __ovld __cnfn as_long2(short8);
-long2 __ovld __cnfn as_long2(ushort8);
-long2 __ovld __cnfn as_long2(int3);
-long2 __ovld __cnfn as_long2(int4);
-long2 __ovld __cnfn as_long2(uint3);
-long2 __ovld __cnfn as_long2(uint4);
-long2 __ovld __cnfn as_long2(long2);
-long2 __ovld __cnfn as_long2(ulong2);
-long2 __ovld __cnfn as_long2(float3);
-long2 __ovld __cnfn as_long2(float4);
-
-long3 __ovld __cnfn as_long3(short16);
-long3 __ovld __cnfn as_long3(ushort16);
-long3 __ovld __cnfn as_long3(int8);
-long3 __ovld __cnfn as_long3(uint8);
-long3 __ovld __cnfn as_long3(long3);
-long3 __ovld __cnfn as_long3(long4);
-long3 __ovld __cnfn as_long3(ulong3);
-long3 __ovld __cnfn as_long3(ulong4);
-long3 __ovld __cnfn as_long3(float8);
-
-long4 __ovld __cnfn as_long4(short16);
-long4 __ovld __cnfn as_long4(ushort16);
-long4 __ovld __cnfn as_long4(int8);
-long4 __ovld __cnfn as_long4(uint8);
-long4 __ovld __cnfn as_long4(long3);
-long4 __ovld __cnfn as_long4(long4);
-long4 __ovld __cnfn as_long4(ulong3);
-long4 __ovld __cnfn as_long4(ulong4);
-long4 __ovld __cnfn as_long4(float8);
-
-long8 __ovld __cnfn as_long8(int16);
-long8 __ovld __cnfn as_long8(uint16);
-long8 __ovld __cnfn as_long8(long8);
-long8 __ovld __cnfn as_long8(ulong8);
-long8 __ovld __cnfn as_long8(float16);
-
-long16 __ovld __cnfn as_long16(long16);
-long16 __ovld __cnfn as_long16(ulong16);
-
-ulong __ovld __cnfn as_ulong(char8);
-ulong __ovld __cnfn as_ulong(uchar8);
-ulong __ovld __cnfn as_ulong(short3);
-ulong __ovld __cnfn as_ulong(short4);
-ulong __ovld __cnfn as_ulong(ushort3);
-ulong __ovld __cnfn as_ulong(ushort4);
-ulong __ovld __cnfn as_ulong(int2);
-ulong __ovld __cnfn as_ulong(uint2);
-ulong __ovld __cnfn as_ulong(long);
-ulong __ovld __cnfn as_ulong(ulong);
-ulong __ovld __cnfn as_ulong(float2);
-
-ulong2 __ovld __cnfn as_ulong2(char16);
-ulong2 __ovld __cnfn as_ulong2(uchar16);
-ulong2 __ovld __cnfn as_ulong2(short8);
-ulong2 __ovld __cnfn as_ulong2(ushort8);
-ulong2 __ovld __cnfn as_ulong2(int3);
-ulong2 __ovld __cnfn as_ulong2(int4);
-ulong2 __ovld __cnfn as_ulong2(uint3);
-ulong2 __ovld __cnfn as_ulong2(uint4);
-ulong2 __ovld __cnfn as_ulong2(long2);
-ulong2 __ovld __cnfn as_ulong2(ulong2);
-ulong2 __ovld __cnfn as_ulong2(float3);
-ulong2 __ovld __cnfn as_ulong2(float4);
-
-ulong3 __ovld __cnfn as_ulong3(short16);
-ulong3 __ovld __cnfn as_ulong3(ushort16);
-ulong3 __ovld __cnfn as_ulong3(int8);
-ulong3 __ovld __cnfn as_ulong3(uint8);
-ulong3 __ovld __cnfn as_ulong3(long3);
-ulong3 __ovld __cnfn as_ulong3(long4);
-ulong3 __ovld __cnfn as_ulong3(ulong3);
-ulong3 __ovld __cnfn as_ulong3(ulong4);
-ulong3 __ovld __cnfn as_ulong3(float8);
-
-ulong4 __ovld __cnfn as_ulong4(short16);
-ulong4 __ovld __cnfn as_ulong4(ushort16);
-ulong4 __ovld __cnfn as_ulong4(int8);
-ulong4 __ovld __cnfn as_ulong4(uint8);
-ulong4 __ovld __cnfn as_ulong4(long3);
-ulong4 __ovld __cnfn as_ulong4(long4);
-ulong4 __ovld __cnfn as_ulong4(ulong3);
-ulong4 __ovld __cnfn as_ulong4(ulong4);
-ulong4 __ovld __cnfn as_ulong4(float8);
-
-ulong8 __ovld __cnfn as_ulong8(int16);
-ulong8 __ovld __cnfn as_ulong8(uint16);
-ulong8 __ovld __cnfn as_ulong8(long8);
-ulong8 __ovld __cnfn as_ulong8(ulong8);
-ulong8 __ovld __cnfn as_ulong8(float16);
-
-ulong16 __ovld __cnfn as_ulong16(long16);
-ulong16 __ovld __cnfn as_ulong16(ulong16);
-
-float __ovld __cnfn as_float(char3);
-float __ovld __cnfn as_float(char4);
-float __ovld __cnfn as_float(uchar3);
-float __ovld __cnfn as_float(uchar4);
-float __ovld __cnfn as_float(short2);
-float __ovld __cnfn as_float(ushort2);
-float __ovld __cnfn as_float(int);
-float __ovld __cnfn as_float(uint);
-float __ovld __cnfn as_float(float);
-
-float2 __ovld __cnfn as_float2(char8);
-float2 __ovld __cnfn as_float2(uchar8);
-float2 __ovld __cnfn as_float2(short3);
-float2 __ovld __cnfn as_float2(short4);
-float2 __ovld __cnfn as_float2(ushort3);
-float2 __ovld __cnfn as_float2(ushort4);
-float2 __ovld __cnfn as_float2(int2);
-float2 __ovld __cnfn as_float2(uint2);
-float2 __ovld __cnfn as_float2(long);
-float2 __ovld __cnfn as_float2(ulong);
-float2 __ovld __cnfn as_float2(float2);
-
-float3 __ovld __cnfn as_float3(char16);
-float3 __ovld __cnfn as_float3(uchar16);
-float3 __ovld __cnfn as_float3(short8);
-float3 __ovld __cnfn as_float3(ushort8);
-float3 __ovld __cnfn as_float3(int3);
-float3 __ovld __cnfn as_float3(int4);
-float3 __ovld __cnfn as_float3(uint3);
-float3 __ovld __cnfn as_float3(uint4);
-float3 __ovld __cnfn as_float3(long2);
-float3 __ovld __cnfn as_float3(ulong2);
-float3 __ovld __cnfn as_float3(float3);
-float3 __ovld __cnfn as_float3(float4);
-
-float4 __ovld __cnfn as_float4(char16);
-float4 __ovld __cnfn as_float4(uchar16);
-float4 __ovld __cnfn as_float4(short8);
-float4 __ovld __cnfn as_float4(ushort8);
-float4 __ovld __cnfn as_float4(int3);
-float4 __ovld __cnfn as_float4(int4);
-float4 __ovld __cnfn as_float4(uint3);
-float4 __ovld __cnfn as_float4(uint4);
-float4 __ovld __cnfn as_float4(long2);
-float4 __ovld __cnfn as_float4(ulong2);
-float4 __ovld __cnfn as_float4(float3);
-float4 __ovld __cnfn as_float4(float4);
-
-float8 __ovld __cnfn as_float8(short16);
-float8 __ovld __cnfn as_float8(ushort16);
-float8 __ovld __cnfn as_float8(int8);
-float8 __ovld __cnfn as_float8(uint8);
-float8 __ovld __cnfn as_float8(long3);
-float8 __ovld __cnfn as_float8(long4);
-float8 __ovld __cnfn as_float8(ulong3);
-float8 __ovld __cnfn as_float8(ulong4);
-float8 __ovld __cnfn as_float8(float8);
-
-float16 __ovld __cnfn as_float16(int16);
-float16 __ovld __cnfn as_float16(uint16);
-float16 __ovld __cnfn as_float16(long8);
-float16 __ovld __cnfn as_float16(ulong8);
-float16 __ovld __cnfn as_float16(float16);
+#define as_char(x) __builtin_astype((x), char)
+#define as_char2(x) __builtin_astype((x), char2)
+#define as_char3(x) __builtin_astype((x), char3)
+#define as_char4(x) __builtin_astype((x), char4)
+#define as_char8(x) __builtin_astype((x), char8)
+#define as_char16(x) __builtin_astype((x), char16)
+
+#define as_uchar(x) __builtin_astype((x), uchar)
+#define as_uchar2(x) __builtin_astype((x), uchar2)
+#define as_uchar3(x) __builtin_astype((x), uchar3)
+#define as_uchar4(x) __builtin_astype((x), uchar4)
+#define as_uchar8(x) __builtin_astype((x), uchar8)
+#define as_uchar16(x) __builtin_astype((x), uchar16)
+
+#define as_short(x) __builtin_astype((x), short)
+#define as_short2(x) __builtin_astype((x), short2)
+#define as_short3(x) __builtin_astype((x), short3)
+#define as_short4(x) __builtin_astype((x), short4)
+#define as_short8(x) __builtin_astype((x), short8)
+#define as_short16(x) __builtin_astype((x), short16)
+
+#define as_ushort(x) __builtin_astype((x), ushort)
+#define as_ushort2(x) __builtin_astype((x), ushort2)
+#define as_ushort3(x) __builtin_astype((x), ushort3)
+#define as_ushort4(x) __builtin_astype((x), ushort4)
+#define as_ushort8(x) __builtin_astype((x), ushort8)
+#define as_ushort16(x) __builtin_astype((x), ushort16)
+
+#define as_int(x) __builtin_astype((x), int)
+#define as_int2(x) __builtin_astype((x), int2)
+#define as_int3(x) __builtin_astype((x), int3)
+#define as_int4(x) __builtin_astype((x), int4)
+#define as_int8(x) __builtin_astype((x), int8)
+#define as_int16(x) __builtin_astype((x), int16)
+
+#define as_uint(x) __builtin_astype((x), uint)
+#define as_uint2(x) __builtin_astype((x), uint2)
+#define as_uint3(x) __builtin_astype((x), uint3)
+#define as_uint4(x) __builtin_astype((x), uint4)
+#define as_uint8(x) __builtin_astype((x), uint8)
+#define as_uint16(x) __builtin_astype((x), uint16)
+
+#define as_long(x) __builtin_astype((x), long)
+#define as_long2(x) __builtin_astype((x), long2)
+#define as_long3(x) __builtin_astype((x), long3)
+#define as_long4(x) __builtin_astype((x), long4)
+#define as_long8(x) __builtin_astype((x), long8)
+#define as_long16(x) __builtin_astype((x), long16)
+
+#define as_ulong(x) __builtin_astype((x), ulong)
+#define as_ulong2(x) __builtin_astype((x), ulong2)
+#define as_ulong3(x) __builtin_astype((x), ulong3)
+#define as_ulong4(x) __builtin_astype((x), ulong4)
+#define as_ulong8(x) __builtin_astype((x), ulong8)
+#define as_ulong16(x) __builtin_astype((x), ulong16)
+
+#define as_float(x) __builtin_astype((x), float)
+#define as_float2(x) __builtin_astype((x), float2)
+#define as_float3(x) __builtin_astype((x), float3)
+#define as_float4(x) __builtin_astype((x), float4)
+#define as_float8(x) __builtin_astype((x), float8)
+#define as_float16(x) __builtin_astype((x), float16)
#ifdef cl_khr_fp64
-char8 __ovld __cnfn as_char8(double);
-char16 __ovld __cnfn as_char16(double2);
-uchar8 __ovld __cnfn as_uchar8(double);
-uchar16 __ovld __cnfn as_uchar16(double2);
-short3 __ovld __cnfn as_short3(double);
-short4 __ovld __cnfn as_short4(double);
-short8 __ovld __cnfn as_short8(double2);
-short16 __ovld __cnfn as_short16(double3);
-short16 __ovld __cnfn as_short16(double4);
-ushort3 __ovld __cnfn as_ushort3(double);
-ushort4 __ovld __cnfn as_ushort4(double);
-ushort8 __ovld __cnfn as_ushort8(double2);
-ushort16 __ovld __cnfn as_ushort16(double3);
-ushort16 __ovld __cnfn as_ushort16(double4);
-int2 __ovld __cnfn as_int2(double);
-int3 __ovld __cnfn as_int3(double2);
-int4 __ovld __cnfn as_int4(double2);
-int8 __ovld __cnfn as_int8(double3);
-int8 __ovld __cnfn as_int8(double4);
-int16 __ovld __cnfn as_int16(double8);
-uint2 __ovld __cnfn as_uint2(double);
-uint3 __ovld __cnfn as_uint3(double2);
-uint4 __ovld __cnfn as_uint4(double2);
-uint8 __ovld __cnfn as_uint8(double3);
-uint8 __ovld __cnfn as_uint8(double4);
-uint16 __ovld __cnfn as_uint16(double8);
-long __ovld __cnfn as_long(double);
-long2 __ovld __cnfn as_long2(double2);
-long3 __ovld __cnfn as_long3(double3);
-long3 __ovld __cnfn as_long3(double4);
-long4 __ovld __cnfn as_long4(double3);
-long4 __ovld __cnfn as_long4(double4);
-long8 __ovld __cnfn as_long8(double8);
-long16 __ovld __cnfn as_long16(double16);
-ulong __ovld __cnfn as_ulong(double);
-ulong2 __ovld __cnfn as_ulong2(double2);
-ulong3 __ovld __cnfn as_ulong3(double3);
-ulong3 __ovld __cnfn as_ulong3(double4);
-ulong4 __ovld __cnfn as_ulong4(double3);
-ulong4 __ovld __cnfn as_ulong4(double4);
-ulong8 __ovld __cnfn as_ulong8(double8);
-ulong16 __ovld __cnfn as_ulong16(double16);
-float2 __ovld __cnfn as_float2(double);
-float3 __ovld __cnfn as_float3(double2);
-float4 __ovld __cnfn as_float4(double2);
-float8 __ovld __cnfn as_float8(double3);
-float8 __ovld __cnfn as_float8(double4);
-float16 __ovld __cnfn as_float16(double8);
-double __ovld __cnfn as_double(char8);
-double __ovld __cnfn as_double(uchar8);
-double __ovld __cnfn as_double(short3);
-double __ovld __cnfn as_double(short4);
-double __ovld __cnfn as_double(ushort3);
-double __ovld __cnfn as_double(ushort4);
-double __ovld __cnfn as_double(int2);
-double __ovld __cnfn as_double(uint2);
-double __ovld __cnfn as_double(long);
-double __ovld __cnfn as_double(ulong);
-double __ovld __cnfn as_double(float2);
-double __ovld __cnfn as_double(double);
-double2 __ovld __cnfn as_double2(char16);
-double2 __ovld __cnfn as_double2(uchar16);
-double2 __ovld __cnfn as_double2(short8);
-double2 __ovld __cnfn as_double2(ushort8);
-double2 __ovld __cnfn as_double2(int3);
-double2 __ovld __cnfn as_double2(int4);
-double2 __ovld __cnfn as_double2(uint3);
-double2 __ovld __cnfn as_double2(uint4);
-double2 __ovld __cnfn as_double2(long2);
-double2 __ovld __cnfn as_double2(ulong2);
-double2 __ovld __cnfn as_double2(float3);
-double2 __ovld __cnfn as_double2(float4);
-double2 __ovld __cnfn as_double2(double2);
-double3 __ovld __cnfn as_double3(short16);
-double3 __ovld __cnfn as_double3(ushort16);
-double3 __ovld __cnfn as_double3(int8);
-double3 __ovld __cnfn as_double3(uint8);
-double3 __ovld __cnfn as_double3(long3);
-double3 __ovld __cnfn as_double3(long4);
-double3 __ovld __cnfn as_double3(ulong3);
-double3 __ovld __cnfn as_double3(ulong4);
-double3 __ovld __cnfn as_double3(float8);
-double3 __ovld __cnfn as_double3(double3);
-double3 __ovld __cnfn as_double3(double4);
-double4 __ovld __cnfn as_double4(short16);
-double4 __ovld __cnfn as_double4(ushort16);
-double4 __ovld __cnfn as_double4(int8);
-double4 __ovld __cnfn as_double4(uint8);
-double4 __ovld __cnfn as_double4(long3);
-double4 __ovld __cnfn as_double4(long4);
-double4 __ovld __cnfn as_double4(ulong3);
-double4 __ovld __cnfn as_double4(ulong4);
-double4 __ovld __cnfn as_double4(float8);
-double4 __ovld __cnfn as_double4(double3);
-double4 __ovld __cnfn as_double4(double4);
-double8 __ovld __cnfn as_double8(int16);
-double8 __ovld __cnfn as_double8(uint16);
-double8 __ovld __cnfn as_double8(long8);
-double8 __ovld __cnfn as_double8(ulong8);
-double8 __ovld __cnfn as_double8(float16);
-double8 __ovld __cnfn as_double8(double8);
-double16 __ovld __cnfn as_double16(long16);
-double16 __ovld __cnfn as_double16(ulong16);
-double16 __ovld __cnfn as_double16(double16);
+#define as_double(x) __builtin_astype((x), double)
+#define as_double2(x) __builtin_astype((x), double2)
+#define as_double3(x) __builtin_astype((x), double3)
+#define as_double4(x) __builtin_astype((x), double4)
+#define as_double8(x) __builtin_astype((x), double8)
+#define as_double16(x) __builtin_astype((x), double16)
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
-char2 __ovld __cnfn as_char2(half);
-char3 __ovld __cnfn as_char3(half2);
-char4 __ovld __cnfn as_char4(half2);
-char8 __ovld __cnfn as_char8(half3);
-char8 __ovld __cnfn as_char8(half4);
-char16 __ovld __cnfn as_char16(half8);
-uchar2 __ovld __cnfn as_uchar2(half);
-uchar3 __ovld __cnfn as_uchar3(half2);
-uchar4 __ovld __cnfn as_uchar4(half2);
-uchar8 __ovld __cnfn as_uchar8(half3);
-uchar8 __ovld __cnfn as_uchar8(half4);
-uchar16 __ovld __cnfn as_uchar16(half8);
-short __ovld __cnfn as_short(half);
-short2 __ovld __cnfn as_short2(half2);
-short3 __ovld __cnfn as_short3(half3);
-short3 __ovld __cnfn as_short3(half4);
-short4 __ovld __cnfn as_short4(half3);
-short4 __ovld __cnfn as_short4(half4);
-short8 __ovld __cnfn as_short8(half8);
-short16 __ovld __cnfn as_short16(half16);
-ushort __ovld __cnfn as_ushort(half);
-ushort2 __ovld __cnfn as_ushort2(half2);
-ushort3 __ovld __cnfn as_ushort3(half3);
-ushort3 __ovld __cnfn as_ushort3(half4);
-ushort4 __ovld __cnfn as_ushort4(half3);
-ushort4 __ovld __cnfn as_ushort4(half4);
-ushort8 __ovld __cnfn as_ushort8(half8);
-ushort16 __ovld __cnfn as_ushort16(half16);
-int __ovld __cnfn as_int(half2);
-int2 __ovld __cnfn as_int2(half3);
-int2 __ovld __cnfn as_int2(half4);
-int3 __ovld __cnfn as_int3(half8);
-int4 __ovld __cnfn as_int4(half8);
-int8 __ovld __cnfn as_int8(half16);
-uint __ovld __cnfn as_uint(half2);
-uint2 __ovld __cnfn as_uint2(half3);
-uint2 __ovld __cnfn as_uint2(half4);
-uint3 __ovld __cnfn as_uint3(half8);
-uint4 __ovld __cnfn as_uint4(half8);
-uint8 __ovld __cnfn as_uint8(half16);
-long __ovld __cnfn as_long(half3);
-long __ovld __cnfn as_long(half4);
-long2 __ovld __cnfn as_long2(half8);
-long3 __ovld __cnfn as_long3(half16);
-long4 __ovld __cnfn as_long4(half16);
-ulong __ovld __cnfn as_ulong(half3);
-ulong __ovld __cnfn as_ulong(half4);
-ulong2 __ovld __cnfn as_ulong2(half8);
-ulong3 __ovld __cnfn as_ulong3(half16);
-ulong4 __ovld __cnfn as_ulong4(half16);
-half __ovld __cnfn as_half(char2);
-half __ovld __cnfn as_half(uchar2);
-half __ovld __cnfn as_half(short);
-half __ovld __cnfn as_half(ushort);
-half __ovld __cnfn as_half(half);
-half2 __ovld __cnfn as_half2(char3);
-half2 __ovld __cnfn as_half2(char4);
-half2 __ovld __cnfn as_half2(uchar3);
-half2 __ovld __cnfn as_half2(uchar4);
-half2 __ovld __cnfn as_half2(short2);
-half2 __ovld __cnfn as_half2(ushort2);
-half2 __ovld __cnfn as_half2(int);
-half2 __ovld __cnfn as_half2(uint);
-half2 __ovld __cnfn as_half2(half2);
-half2 __ovld __cnfn as_half2(float);
-half3 __ovld __cnfn as_half3(char8);
-half3 __ovld __cnfn as_half3(uchar8);
-half3 __ovld __cnfn as_half3(short3);
-half3 __ovld __cnfn as_half3(short4);
-half3 __ovld __cnfn as_half3(ushort3);
-half3 __ovld __cnfn as_half3(ushort4);
-half3 __ovld __cnfn as_half3(int2);
-half3 __ovld __cnfn as_half3(uint2);
-half3 __ovld __cnfn as_half3(long);
-half3 __ovld __cnfn as_half3(ulong);
-half3 __ovld __cnfn as_half3(half3);
-half3 __ovld __cnfn as_half3(half4);
-half3 __ovld __cnfn as_half3(float2);
-half4 __ovld __cnfn as_half4(char8);
-half4 __ovld __cnfn as_half4(uchar8);
-half4 __ovld __cnfn as_half4(short3);
-half4 __ovld __cnfn as_half4(short4);
-half4 __ovld __cnfn as_half4(ushort3);
-half4 __ovld __cnfn as_half4(ushort4);
-half4 __ovld __cnfn as_half4(int2);
-half4 __ovld __cnfn as_half4(uint2);
-half4 __ovld __cnfn as_half4(long);
-half4 __ovld __cnfn as_half4(ulong);
-half4 __ovld __cnfn as_half4(half3);
-half4 __ovld __cnfn as_half4(half4);
-half4 __ovld __cnfn as_half4(float2);
-half8 __ovld __cnfn as_half8(char16);
-half8 __ovld __cnfn as_half8(uchar16);
-half8 __ovld __cnfn as_half8(short8);
-half8 __ovld __cnfn as_half8(ushort8);
-half8 __ovld __cnfn as_half8(int3);
-half8 __ovld __cnfn as_half8(int4);
-half8 __ovld __cnfn as_half8(uint3);
-half8 __ovld __cnfn as_half8(uint4);
-half8 __ovld __cnfn as_half8(long2);
-half8 __ovld __cnfn as_half8(ulong2);
-half8 __ovld __cnfn as_half8(half8);
-half8 __ovld __cnfn as_half8(float3);
-half8 __ovld __cnfn as_half8(float4);
-half16 __ovld __cnfn as_half16(short16);
-half16 __ovld __cnfn as_half16(ushort16);
-half16 __ovld __cnfn as_half16(int8);
-half16 __ovld __cnfn as_half16(uint8);
-half16 __ovld __cnfn as_half16(long3);
-half16 __ovld __cnfn as_half16(long4);
-half16 __ovld __cnfn as_half16(ulong3);
-half16 __ovld __cnfn as_half16(ulong4);
-half16 __ovld __cnfn as_half16(half16);
-half16 __ovld __cnfn as_half16(float8);
-float __ovld __cnfn as_float(half2);
-float2 __ovld __cnfn as_float2(half3);
-float2 __ovld __cnfn as_float2(half4);
-float3 __ovld __cnfn as_float3(half8);
-float4 __ovld __cnfn as_float4(half8);
-float8 __ovld __cnfn as_float8(half16);
-
-#ifdef cl_khr_fp64
-half3 __ovld __cnfn as_half3(double);
-half4 __ovld __cnfn as_half4(double);
-half8 __ovld __cnfn as_half8(double2);
-half16 __ovld __cnfn as_half16(double3);
-half16 __ovld __cnfn as_half16(double4);
-double __ovld __cnfn as_double(half3);
-double __ovld __cnfn as_double(half4);
-double2 __ovld __cnfn as_double2(half8);
-double3 __ovld __cnfn as_double3(half16);
-double4 __ovld __cnfn as_double4(half16);
-#endif //cl_khr_fp64
+#define as_half(x) __builtin_astype((x), half)
+#define as_half2(x) __builtin_astype((x), half2)
+#define as_half3(x) __builtin_astype((x), half3)
+#define as_half4(x) __builtin_astype((x), half4)
+#define as_half8(x) __builtin_astype((x), half8)
+#define as_half16(x) __builtin_astype((x), half16)
#endif //cl_khr_fp16
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
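
Editor's note: the hundreds of per-type as_* overload declarations above collapse into macros over __builtin_astype, which reinterprets the bit pattern of a value as another type of the same size. A short OpenCL C sketch:

  // as_float reinterprets bits; it does not convert the value.
  kernel void reinterpret(global uint *in, global float *out) {
    uint u = in[0];        // e.g. 0x3F800000
    out[0] = as_float(u);  // yields 1.0f for that bit pattern
  }
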
@@ -14389,10 +13703,10 @@ float __ovld atomic_xchg(volatile __local float *p, float val);
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
-int __ovld atom_xchg(volatile __local int *p, int val);
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_base_atomics)
-unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atom_xchg(volatile __local int *p, int val);
unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
#endif
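
Editor's note: this hunk moves each atom_xchg declaration under the extension guard that actually provides it (global vs. local int32 base atomics). A hedged OpenCL C sketch of a use:

  // Atomically swap in a flag and observe the previous value.
  // Assumes cl_khr_global_int32_base_atomics where atom_xchg is not core.
  kernel void try_flag(volatile global int *flag, global int *prev) {
    prev[get_global_id(0)] = atom_xchg(flag, 1);
  }
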
@@ -14509,8 +13823,6 @@ unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val)
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_min(volatile __global long *p, long val);
unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
-#endif
-#if defined(cl_khr_local_int32_extended_atomics)
long __ovld atom_min(volatile __local long *p, long val);
unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
#endif
@@ -15995,9 +15307,11 @@ void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, flo
void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+#endif
#ifdef cl_khr_depth_images
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
@@ -16025,16 +15339,20 @@ void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, in
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif
#endif //cl_khr_mipmap_image
// Image write functions for half4 type
#ifdef cl_khr_fp16
void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+#endif
void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
@@ -16062,9 +15380,11 @@ void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, flo
void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+#endif
#ifdef cl_khr_depth_images
void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
@@ -16091,16 +15411,20 @@ void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, in
void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+#endif
#endif //cl_khr_mipmap_image
// Image write functions for half4 type
#ifdef cl_khr_fp16
void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+#endif
void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
@@ -16118,7 +15442,9 @@ void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 col
int __ovld __cnfn get_image_width(read_only image1d_t image);
int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
int __ovld __cnfn get_image_width(read_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_only image3d_t image);
+#endif
int __ovld __cnfn get_image_width(read_only image1d_array_t image);
int __ovld __cnfn get_image_width(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
@@ -16135,7 +15461,9 @@ int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
int __ovld __cnfn get_image_width(write_only image1d_t image);
int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_width(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(write_only image3d_t image);
+#endif
int __ovld __cnfn get_image_width(write_only image1d_array_t image);
int __ovld __cnfn get_image_width(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
@@ -16186,7 +15514,9 @@ int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int __ovld __cnfn get_image_height(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(write_only image3d_t image);
+#endif
int __ovld __cnfn get_image_height(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
@@ -16220,7 +15550,9 @@ int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
*/
int __ovld __cnfn get_image_depth(read_only image3d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t image);
+#endif
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld __cnfn get_image_depth(read_write image3d_t image);
@@ -16238,7 +15570,9 @@ int __ovld get_image_num_mip_levels(read_only image3d_t image);
int __ovld get_image_num_mip_levels(write_only image1d_t image);
int __ovld get_image_num_mip_levels(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(write_only image3d_t image);
+#endif
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld get_image_num_mip_levels(read_write image1d_t image);
@@ -16324,7 +15658,9 @@ int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth
int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+#endif
int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
@@ -16418,7 +15754,9 @@ int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t i
int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+#endif
int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
@@ -16504,7 +15842,9 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
* component and the w component is 0.
*/
int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#endif
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -16714,16 +16054,12 @@ typedef int clk_profiling_info;
#define MAX_WORK_DIM 3
-// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
-// the following ndrange_t definition.
-#if 0
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
-#endif
ndrange_t __ovld ndrange_1D(size_t);
ndrange_t __ovld ndrange_1D(size_t, size_t);
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
index d4f6487af179..a479d9ed2911 100644
--- a/lib/Headers/pmmintrin.h
+++ b/lib/Headers/pmmintrin.h
@@ -115,7 +115,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
+/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
@@ -136,7 +136,7 @@ _mm_movehdup_ps(__m128 __a)
}
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
-/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
+/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
@@ -257,14 +257,6 @@ _mm_movedup_pd(__m128d __a)
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
-#define _MM_DENORMALS_ZERO_ON (0x0040)
-#define _MM_DENORMALS_ZERO_OFF (0x0000)
-
-#define _MM_DENORMALS_ZERO_MASK (0x0040)
-
-#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
-#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
-
/// \brief Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
diff --git a/lib/Headers/prfchwintrin.h b/lib/Headers/prfchwintrin.h
index ba0285751823..a3789126ef07 100644
--- a/lib/Headers/prfchwintrin.h
+++ b/lib/Headers/prfchwintrin.h
@@ -29,12 +29,36 @@
#define __PRFCHWINTRIN_H
#if defined(__PRFCHW__) || defined(__3dNOW__)
+/// \brief Loads a memory sequence containing the specified memory address into
+/// all data cache levels. The cache-coherency state is set to exclusive.
+/// Data can be read from and written to the cache line without additional
+/// delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetch(void *__P)
{
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}
+/// \brief Loads a memory sequence containing the specified memory address into
+/// the L1 data cache and sets the cache-coherency to modified. This
+/// provides a hint to the processor that the cache line will be modified.
+/// It is intended for use when the cache line will be written to shortly
+/// after the prefetch is performed. Note that the effect of this intrinsic
+/// is dependent on the processor implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
{
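
Editor's note: the new comments document the two prefetch hints; _m_prefetchw in particular requests the line in a writable coherency state. A small sketch, assuming a target built with -mprfchw:

  #include <x86intrin.h>

  /* Hint that the cache line will be written shortly, then write it. */
  void bump(int *counter) {
    _m_prefetchw(counter);
    *counter += 1;
  }
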
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h
index e48ab034f46f..dccba4e40b2d 100644
--- a/lib/Headers/smmintrin.h
+++ b/lib/Headers/smmintrin.h
@@ -46,37 +46,394 @@
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float] values to be rounded up.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
+
+/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double] values to be rounded up.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
+
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
+/// operand to an integer and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded up to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds up the lower element of the second 128-bit vector operand to an
+/// integer and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded up to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
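
Editor's note: a lane-by-lane sketch of the ceiling family documented above, assuming a target built with -msse4.1:

  #include <smmintrin.h>

  /* Round each lane up to an integral value. */
  __m128 ceil_example(void) {
    __m128 v = _mm_set_ps(2.5f, -1.25f, 0.5f, 3.0f);
    return _mm_ceil_ps(v);   /* lane-wise: {3.0, -1.0, 1.0, 3.0} */
  }
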
+/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float] values to be rounded down.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
+
+/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double] values to be rounded down.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
+
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
+/// operand to an integer and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded down to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds down the lower element of the second 128-bit vector operand to an
+/// integer and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded down to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
+/// integer value according to the rounding control specified by the second
+/// argument and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ps(__m128 X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) __extension__ ({ \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
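The rounding-control bits described above are normally composed from the
_MM_FROUND_* macros defined in this header. A small sketch (invented helper
name) using nearest-even rounding with the precision exception suppressed:

#include <smmintrin.h>

static __m128 demo_round_ps(void) {
  __m128 v = _mm_set_ps(3.5f, 2.5f, -2.5f, 1.5f); /* lanes [1.5, -2.5, 2.5, 3.5] */
  return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  /* nearest-even: [2.0, -2.0, 2.0, 4.0] */
}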
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds the lowest element of the second 128-bit vector
+/// operand to an integer value according to the rounding control specified
+/// by the third argument and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded to the nearest integer using the specified rounding control and
+/// copied to the corresponding bits of the result.
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
#define _mm_round_ss(X, Y, M) __extension__ ({ \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
+/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
+/// integer value according to the rounding control specified by the second
+/// argument and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_pd(__m128d X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) __extension__ ({ \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds the lower element of the second 128-bit vector operand to an
+/// integer value according to the rounding control specified by the third
+/// argument and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded to the nearest integer using the specified rounding control and
+/// copied to the corresponding bits of the result.
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
#define _mm_round_sd(X, Y, M) __extension__ ({ \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
/* SSE4 Packed Blending Intrinsics. */
+/// \brief Returns a 128-bit vector of [2 x double] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VBLENDPD / BLENDPD </i> </c>
+/// instruction.
+///
+/// \param V1
+/// A 128-bit vector of [2 x double].
+/// \param V2
+/// A 128-bit vector of [2 x double].
+/// \param M
+/// An immediate integer operand, with mask bits [1:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), \
(((M) & 0x01) ? 2 : 0), \
(((M) & 0x02) ? 3 : 1)); })
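As a usage sketch (hypothetical helper, SSE4.1 assumed), mask 0x2 keeps the
low element of the first operand and takes the high element of the second:

#include <smmintrin.h>

static __m128d demo_blend_pd(void) {
  __m128d a = _mm_set_pd(11.0, 10.0); /* lanes [10.0, 11.0] */
  __m128d b = _mm_set_pd(21.0, 20.0); /* lanes [20.0, 21.0] */
  return _mm_blend_pd(a, b, 0x2);     /* mask 10b -> [10.0, 21.0] */
}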
+/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
+/// from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VBLENDPS / BLENDPS </i> </c>
+/// instruction.
+///
+/// \param V1
+/// A 128-bit vector of [4 x float].
+/// \param V2
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An immediate integer operand, with mask bits [3:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(((M) & 0x01) ? 4 : 0), \
@@ -84,6 +441,27 @@
(((M) & 0x04) ? 6 : 2), \
(((M) & 0x08) ? 7 : 3)); })
+/// \brief Returns a 128-bit vector of [2 x double] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VBLENDVPD / BLENDVPD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [2 x double].
+/// \param __V2
+/// A 128-bit vector of [2 x double].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 64-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 64-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
@@ -91,6 +469,27 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
(__v2df)__M);
}
+/// \brief Returns a 128-bit vector of [4 x float] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VBLENDVPS / BLENDVPS </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x float].
+/// \param __V2
+/// A 128-bit vector of [4 x float].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
+/// how the values are to be copied. The position of the mask bit corresponds
+/// to the most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 32-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 32-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
@@ -98,6 +497,27 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
(__v4sf)__M);
}
+/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
+///    from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPBLENDVB / PBLENDVB </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+/// A 128-bit vector of [16 x i8].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
+/// how the values are to be copied. The position of the mask bit corresponds
+/// to the most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 8-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 8-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [16 x i8] containing the copied values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
@@ -105,6 +525,31 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
(__v16qi)__M);
}
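A common idiom (a sketch, not from this header) is to build the byte mask
with a compare, whose results have the sign bit set exactly where the
predicate holds:

#include <smmintrin.h>

static __m128i demo_blendv_epi8(__m128i a, __m128i b) {
  /* Take b's byte wherever a's byte is negative; otherwise keep a's byte. */
  __m128i mask = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_blendv_epi8(a, b, mask);
}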
+/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
+///    from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPBLENDW / PBLENDW </i> </c>
+/// instruction.
+///
+/// \param V1
+/// A 128-bit vector of [8 x i16].
+/// \param V2
+/// A 128-bit vector of [8 x i16].
+/// \param M
+/// An immediate integer operand, with mask bits [7:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), \
@@ -118,12 +563,41 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
(((M) & 0x80) ? 15 : 7)); })
/* SSE4 Dword Multiply Instructions. */
+/// \brief Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
+///    and returns the lower 32 bits of each product in a 128-bit vector of
+/// [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMULLD / PMULLD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
}
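To contrast the two multiply forms, a minimal sketch (invented helper;
_mm_mul_epi32 is defined just below):

#include <smmintrin.h>

static void demo_mul(__m128i a, __m128i b, __m128i *lo32, __m128i *full64) {
  *lo32   = _mm_mullo_epi32(a, b); /* low 32 bits of all four products      */
  *full64 = _mm_mul_epi32(a, b);   /* full 64-bit products of lanes 0 and 2 */
}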
+/// \brief Multiplies corresponding even-indexed elements of two 128-bit
+/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
+/// containing the products.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMULDQ / PMULDQ </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [2 x i64] containing the products of both
+/// operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
@@ -131,64 +605,250 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
}
/* SSE4 Floating Point Dot Product Instructions. */
+/// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
+/// and returns it in the elements of the 128-bit result vector of
+/// [4 x float]. The immediate integer operand controls which input elements
+/// will contribute to the dot product, and where the final results are
+/// returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VDPPS / DPPS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param Y
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An immediate integer operand. Mask bits [7:4] determine which elements
+/// of the input vectors are used, with bit [4] corresponding to the lowest
+/// element and bit [7] corresponding to the highest element of each [4 x
+/// float] vector. If a bit is set, the corresponding elements from the two
+///    input vectors are used as inputs for the dot product; otherwise that input
+/// is treated as zero. Bits [3:0] determine which elements of the result
+/// will receive a copy of the final dot product, with bit [0] corresponding
+/// to the lowest element and bit [3] corresponding to the highest element of
+/// each [4 x float] subvector. If a bit is set, the dot product is returned
+/// in the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
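A typical use is a full 4-element dot product delivered in lane 0; a sketch
under those assumptions (helper name invented):

#include <smmintrin.h>

static float demo_dot4(__m128 x, __m128 y) {
  /* 0xF1: bits [7:4] = 1111 use all four input lanes; bits [3:0] = 0001
     write the dot product to lane 0 and zero the other lanes. */
  return _mm_cvtss_f32(_mm_dp_ps(x, y, 0xF1));
}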
+/// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
+/// and returns it in the elements of the 128-bit result vector of
+/// [2 x double]. The immediate integer operand controls which input
+/// elements will contribute to the dot product, and where the final results
+/// are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VDPPD / DPPD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double].
+/// \param Y
+/// A 128-bit vector of [2 x double].
+/// \param M
+///    An immediate integer operand. Mask bits [5:4] determine which elements
+///    of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [5] corresponding to the highest element of each
+///    [2 x double] vector. If a bit is set, the corresponding elements from
+///    the two input vectors are used as inputs for the dot product; otherwise
+///    that input is treated as zero. Bits [1:0] determine which elements of
+///    the result will receive a copy of the final dot product, with bit [0]
+///    corresponding to the lowest element and bit [1] corresponding to the
+///    highest element of each [2 x double] vector. If a bit is set, the dot
+///    product is returned in the corresponding element; otherwise that
+///    element is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) __extension__ ({\
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
/* SSE4 Streaming Load Hint Instruction. */
+/// \brief Loads integer values from a 128-bit aligned memory location to a
+/// 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VMOVNTDQA / MOVNTDQA </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A pointer to a 128-bit aligned memory location that contains the integer
+/// values.
+/// \returns A 128-bit integer vector containing the data stored at the
+/// specified memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128 (__m128i const *__V)
{
- return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+ return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
}
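A usage sketch (hypothetical helper): the pointer must be 16-byte aligned,
and the non-temporal hint is chiefly useful on write-combining memory such
as video buffers; on ordinary cacheable memory it behaves like a plain load:

#include <smmintrin.h>

static __m128i demo_stream_load(const void *p) {
  return _mm_stream_load_si128((__m128i const *)p); /* p: 16-byte aligned */
}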
/* SSE4 Packed Integer Min/Max Instructions. */
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
+/// of the two values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMINSB / PMINSB </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMAXSB / PMAXSB </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}
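The packed min/max variants all follow the same per-element pattern; a
sketch for the signed-byte pair (invented helper):

#include <smmintrin.h>

static void demo_minmax_i8(__m128i a, __m128i b, __m128i *mn, __m128i *mx) {
  *mn = _mm_min_epi8(a, b); /* per-byte signed minimum */
  *mx = _mm_max_epi8(a, b); /* per-byte signed maximum */
}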
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMINUW / PMINUW </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [8 x u16].
+/// \param __V2
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMAXUW / PMAXUW </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [8 x u16].
+/// \param __V2
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMINSD / PMINSD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMAXSD / PMAXSD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMINUD / PMINUD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x u32].
+/// \param __V2
+/// A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
}
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMAXUD / PMAXUD </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x u32].
+/// \param __V2
+/// A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
@@ -196,7 +856,70 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
}
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
+/// \brief Takes the first argument \a X and inserts an element from the second
+/// argument \a Y as selected by the third argument \a N. That result then
+///    has elements zeroed out, also as selected by the third argument \a N. The
+/// resulting 128-bit vector of [4 x float] is then returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VINSERTPS </i> </c> instruction.
+///
+/// \param X
+/// A 128-bit vector source operand of [4 x float]. With the exception of
+/// those bits in the result copied from parameter \a Y and zeroed by bits
+/// [3:0] of \a N, all bits from this parameter are copied to the result.
+/// \param Y
+/// A 128-bit vector source operand of [4 x float]. One single-precision
+/// floating-point element from this source, as determined by the immediate
+/// parameter, is copied to the result.
+/// \param N
+/// Specifies which bits from operand \a Y will be copied, which bits in the
+///    result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made: \n
+/// Bits [7:6] specify the bits to copy from operand \a Y: \n
+/// 00: Selects bits [31:0] from operand \a Y. \n
+/// 01: Selects bits [63:32] from operand \a Y. \n
+/// 10: Selects bits [95:64] from operand \a Y. \n
+/// 11: Selects bits [127:96] from operand \a Y. \n
+/// Bits [5:4] specify the bits in the result to which the selected bits
+/// from operand \a Y are copied: \n
+/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
+/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
+/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
+/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
+/// Bits[3:0]: If any of these bits are set, the corresponding result
+/// element is cleared.
+/// \returns A 128-bit vector of [4 x float] containing the copied single-
+///    precision floating-point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
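Decoding the immediate by hand is the error-prone part; a sketch (invented
helper) that copies lane 1 of the second operand into lane 2 of the result
without zeroing anything:

#include <smmintrin.h>

static __m128 demo_insert_ps(__m128 x, __m128 y) {
  /* 0x60: bits [7:6] = 01 select y's lane 1, bits [5:4] = 10 write it to
     result lane 2, bits [3:0] = 0000 clear no lanes. */
  return _mm_insert_ps(x, y, 0x60);
}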
+
+/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+/// returns it, using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_ps(__m128 X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VEXTRACTPS / EXTRACTPS </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param N
+///    An immediate value. Bits [1:0] determine which bits from the argument
+/// \a X are extracted and returned: \n
+/// 00: Bits [31:0] of parameter \a X are returned. \n
+/// 01: Bits [63:32] of parameter \a X are returned. \n
+/// 10: Bits [95:64] of parameter \a X are returned. \n
+/// 11: Bits [127:96] of parameter \a X are returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
__v4sf __a = (__v4sf)(__m128)(X); \
@@ -217,15 +940,113 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
/* Insert int into packed integer array at index. */
+/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
+///    of an integer parameter \a I at the offset specified by the immediate
+/// value parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPINSRB / PINSRB </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
+/// result and then one of the sixteen elements in the result vector is
+/// replaced by the lower 8 bits of \a I.
+/// \param I
+/// An integer. The lower 8 bits of this operand are written to the result
+/// beginning at the offset specified by \a N.
+/// \param N
+/// An immediate value. Bits [3:0] specify the bit offset in the result at
+/// which the lower 8 bits of \a I are written. \n
+/// 0000: Bits [7:0] of the result are used for insertion. \n
+/// 0001: Bits [15:8] of the result are used for insertion. \n
+/// 0010: Bits [23:16] of the result are used for insertion. \n
+/// 0011: Bits [31:24] of the result are used for insertion. \n
+/// 0100: Bits [39:32] of the result are used for insertion. \n
+/// 0101: Bits [47:40] of the result are used for insertion. \n
+/// 0110: Bits [55:48] of the result are used for insertion. \n
+/// 0111: Bits [63:56] of the result are used for insertion. \n
+/// 1000: Bits [71:64] of the result are used for insertion. \n
+/// 1001: Bits [79:72] of the result are used for insertion. \n
+/// 1010: Bits [87:80] of the result are used for insertion. \n
+/// 1011: Bits [95:88] of the result are used for insertion. \n
+/// 1100: Bits [103:96] of the result are used for insertion. \n
+/// 1101: Bits [111:104] of the result are used for insertion. \n
+/// 1110: Bits [119:112] of the result are used for insertion. \n
+/// 1111: Bits [127:120] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
__a[(N) & 15] = (I); \
(__m128i)__a;}))
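For example (a sketch; helper name invented), replacing the highest byte:

#include <smmintrin.h>

static __m128i demo_insert_epi8(__m128i v) {
  return _mm_insert_epi8(v, 0x7F, 15); /* new byte lands in bits [127:120] */
}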
+
+/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the 32-bit
+/// integer parameter \a I at the offset specified by the immediate value
+/// parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPINSRD / PINSRD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
+/// result and then one of the four elements in the result vector is
+/// replaced by \a I.
+/// \param I
+/// A 32-bit integer that is written to the result beginning at the offset
+/// specified by \a N.
+/// \param N
+/// An immediate value. Bits [1:0] specify the bit offset in the result at
+///    which the integer \a I is written. \n
+/// 00: Bits [31:0] of the result are used for insertion. \n
+/// 01: Bits [63:32] of the result are used for insertion. \n
+/// 10: Bits [95:64] of the result are used for insertion. \n
+/// 11: Bits [127:96] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
__a[(N) & 3] = (I); \
(__m128i)__a;}))
#ifdef __x86_64__
+/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the 64-bit
+/// integer parameter \a I, using the immediate value parameter \a N as an
+/// insertion location selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPINSRQ / PINSRQ </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
+/// result and then one of the two elements in the result vector is replaced
+/// by \a I.
+/// \param I
+/// A 64-bit integer that is written to the result beginning at the offset
+/// specified by \a N.
+/// \param N
+/// An immediate value. Bit [0] specifies the bit offset in the result at
+///    which the integer \a I is written. \n
+/// 0: Bits [63:0] of the result are used for insertion. \n
+/// 1: Bits [127:64] of the result are used for insertion. \n
+/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
__a[(N) & 1] = (I); \
@@ -235,42 +1056,228 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/* Extract int from packed integer array at index. This returns the element
* as a zero extended value, so it is unsigned.
*/
+/// \brief Extracts an 8-bit element from the 128-bit integer vector of
+/// [16 x i8], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi8(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPEXTRB / PEXTRB </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bits [3:0] specify which 8-bit vector element
+/// from the argument \a X to extract and copy to the result. \n
+/// 0000: Bits [7:0] of parameter \a X are extracted. \n
+/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
+/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
+/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
+/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
+/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
+/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
+/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
+/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
+/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
+/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
+/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
+/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
+/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
+/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
+/// 1111: Bits [127:120] of the parameter \a X are extracted.
+/// \returns An unsigned integer, whose lower 8 bits are selected from the
+/// 128-bit integer vector parameter and the remaining bits are assigned
+/// zeros.
#define _mm_extract_epi8(X, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
(int)(unsigned char) __a[(N) & 15];}))
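Note the zero extension: the returned int is never negative. A sketch
(invented helper):

#include <smmintrin.h>

static int demo_extract_epi8(__m128i v) {
  return _mm_extract_epi8(v, 0); /* lowest byte, zero-extended to 0..255 */
}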
+
+/// \brief Extracts a 32-bit element from the 128-bit integer vector of
+/// [4 x i32], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi32(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPEXTRD / PEXTRD </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bits [1:0] specify which 32-bit vector element
+/// from the argument \a X to extract and copy to the result. \n
+/// 00: Bits [31:0] of the parameter \a X are extracted. \n
+/// 01: Bits [63:32] of the parameter \a X are extracted. \n
+/// 10: Bits [95:64] of the parameter \a X are extracted. \n
+///    11: Bits [127:96] of the parameter \a X are extracted.
+/// \returns An integer, whose lower 32 bits are selected from the 128-bit
+/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
(int)__a[(N) & 3];}))
#ifdef __x86_64__
+/// \brief Extracts a 64-bit element from the 128-bit integer vector of
+/// [2 x i64], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// long long _mm_extract_epi64(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPEXTRQ / PEXTRQ </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bit [0] specifies which 64-bit vector element
+/// from the argument \a X to return. \n
+/// 0: Bits [63:0] are returned. \n
+/// 1: Bits [127:64] are returned. \n
+/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
(long long)__a[(N) & 1];}))
#endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testz_si128(__m128i __M, __m128i __V)
{
return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}
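The three PTEST wrappers expose the instruction's ZF and CF results; a
sketch (helper name invented) combining this intrinsic with
_mm_testc_si128, which is defined just below:

#include <smmintrin.h>

static int demo_ptest(__m128i m, __m128i v) {
  /* bit 0: (m AND v) == 0 (ZF); bit 1: ((NOT m) AND v) == 0 (CF) */
  return _mm_testz_si128(m, v) | (_mm_testc_si128(m, v) << 1);
}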
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testc_si128(__m128i __M, __m128i __V)
{
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+/// FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_ones(__m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param V
+/// A 128-bit integer vector containing the bits to be tested.
+/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
+/// otherwise.
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param V
+/// A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+/// FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
+/// instruction.
+///
+/// \param M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param V
+/// A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
/* SSE4 64-bit Packed Integer Comparisons. */
+/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPEQQ / PCMPEQQ </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
@@ -278,6 +1285,20 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
}
/* SSE4 Packed Integer Sign-Extension. */
+/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
+/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXBW / PMOVSXBW </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
+/// extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
@@ -286,6 +1307,20 @@ _mm_cvtepi8_epi16(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
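A quick sketch of the sign extension (invented helper):

#include <smmintrin.h>

static __m128i demo_sext(void) {
  __m128i v = _mm_set1_epi8(-5); /* sixteen bytes, each 0xFB */
  return _mm_cvtepi8_epi16(v);   /* eight 16-bit words, each 0xFFFB (-5) */
}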
+/// \brief Sign-extends each of the lower four 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
+/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXBD / PMOVSXBD </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
+/// extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
{
@@ -294,6 +1329,20 @@ _mm_cvtepi8_epi32(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
+/// \brief Sign-extends each of the lower two 8-bit integer elements of a
+/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXBQ / PMOVSXBQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
@@ -302,18 +1351,60 @@ _mm_cvtepi8_epi64(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
+/// \brief Sign-extends each of the lower four 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+/// a 128-bit vector of [4 x i32]. The upper four elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXWD / PMOVSXWD </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
+/// extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
+/// \brief Sign-extends each of the lower two 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper six elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXWQ / PMOVSXWQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
+/// \brief Sign-extends each of the lower two 32-bit integer elements of a
+/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVSXDQ / PMOVSXDQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
@@ -321,36 +1412,120 @@ _mm_cvtepi32_epi64(__m128i __V)
}
/* SSE4 Packed Integer Zero-Extension. */
+/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
+/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXBW / PMOVZXBW </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
+/// extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
+/// \brief Zero-extends each of the lower four 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
+/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXBD / PMOVZXBD </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
+/// extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
}
+/// \brief Zero-extends each of the lower two 8-bit integer elements of a
+/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXBQ / PMOVZXBQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
+/// \brief Zero-extends each of the lower four 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+/// a 128-bit vector of [4 x i32]. The upper four elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXWD / PMOVZXWD </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
+/// extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
+/// \brief Zero-extends each of the lower two 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXWQ / PMOVZXWQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
+/// \brief Zero-extends each of the lower two 32-bit integer elements of a
+/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPMOVZXDQ / PMOVZXDQ </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
+/// extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
{
@@ -358,6 +1533,29 @@ _mm_cvtepu32_epi64(__m128i __V)
}
/* SSE4 Pack with Unsigned Saturation. */
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+/// operands into 16-bit unsigned integers, and returns the packed result.
+/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
+/// 0x0000 are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPACKUSDW / PACKUSDW </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+/// signed integer and is converted to a 16-bit unsigned integer with
+/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+/// are written to the lower 64 bits of the result.
+/// \param __V2
+/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+/// signed integer and is converted to a 16-bit unsigned integer with
+/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+/// are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
@@ -365,10 +1563,59 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
}
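The unsigned saturation is what distinguishes this from the SSE2 pack
operations; a sketch (invented helper):

#include <smmintrin.h>

static __m128i demo_packus(void) {
  __m128i a = _mm_set_epi32(70000, 5, -1, 40000); /* lanes [40000, -1, 5, 70000] */
  return _mm_packus_epi32(a, a); /* -1 -> 0, 70000 -> 65535: each half holds
                                    the words [40000, 0, 5, 65535] */
}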
/* SSE4 Multiple Packed Sums of Absolute Difference. */
+/// \brief Subtracts 8-bit unsigned integer values and computes the absolute
+/// values of the differences to the corresponding bits in the destination.
+///    The sums of the absolute differences are then returned according to
+///    the bit fields in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VMPSADBW / MPSADBW </i> </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [16 x i8].
+/// \param Y
+/// A 128-bit vector of [16 x i8].
+/// \param M
+/// An 8-bit immediate operand specifying how the absolute differences are to
+/// be calculated, according to the following algorithm:
+/// \code
+/// // M2 represents bit 2 of the immediate operand
+/// // M10 represents bits [1:0] of the immediate operand
+/// i = M2 * 4
+/// j = M10 * 4
+/// for (k = 0; k < 8; k = k + 1) {
+/// d0 = abs(X[i + k + 0] - Y[j + 0])
+/// d1 = abs(X[i + k + 1] - Y[j + 1])
+/// d2 = abs(X[i + k + 2] - Y[j + 2])
+/// d3 = abs(X[i + k + 3] - Y[j + 3])
+/// r[k] = d0 + d1 + d2 + d3
+/// }
+/// \endcode
+/// \returns A 128-bit integer vector containing the sums of the sets of
+/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)); })
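With M = 0 the instruction compares the low four bytes of the second operand
against the eight overlapping four-byte windows starting at bytes 0..7 of
the first operand, a building block of motion-estimation kernels; a sketch
(names invented):

#include <smmintrin.h>

static __m128i demo_mpsadbw(__m128i haystack, __m128i needle) {
  /* Eight 16-bit SADs: window k of haystack vs. needle bytes [3:0]. */
  return _mm_mpsadbw_epu8(haystack, needle, 0);
}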
+/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
+///    vector of [8 x u16] and returns it along with its index.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPHMINPOSUW / PHMINPOSUW </i> </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit value where bits [15:0] contain the minimum value found
+/// in parameter \a __V, bits [18:16] contain the index of the minimum value
+/// and the remaining bits are set to 0.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_minpos_epu16(__m128i __V)
{
@@ -410,61 +1657,769 @@ _mm_minpos_epu16(__m128i __V)
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns a 128-bit integer vector representing the result
+/// mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRM / PCMPISTRM </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
+/// bytes. \n
+/// 0: The result is zero-extended to 16 bytes. \n
+/// 1: The result is expanded to 16 bytes (this expansion is performed by
+/// repeating each bit 8 or 16 times).
+/// \returns Returns a 128-bit integer vector representing the result mask of
+/// the comparison.
#define _mm_cmpistrm(A, B, M) \
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
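
[Editor's illustration] A hedged sketch of the mask form, using the _SIDD_* mode macros defined earlier in this header (assumes <x86intrin.h> and -msse4.2; the helper and its character set are illustrative only). Bits [3:2] = 00 select the "subset"/equal-any aggregation; _SIDD_UNIT_MASK sets bit 6 so each result bit expands to a full byte.

    /* Hypothetical helper: byte mask marking which input bytes are ASCII
       vowels. The NUL padding in `set` terminates the implicit-length
       string. */
    static __m128i mark_vowels(const char block[16])
    {
        static const char set[16] = "aeiouAEIOU";   /* zero-padded to 16 */
        __m128i a = _mm_loadu_si128((const __m128i *)set);
        __m128i b = _mm_loadu_si128((const __m128i *)block);
        return _mm_cmpistrm(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                                  _SIDD_UNIT_MASK);
    }
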
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns an integer representing the result index of the
+/// comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the index of the lowest set bit or the
+/// highest set bit is returned. \n
+/// 0: The index of the least significant set bit. \n
+/// 1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
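
[Editor's illustration] The index form returns a position instead of a mask. A minimal sketch (hypothetical helper, same header and target flags as above) using the "ranges" aggregation, bits [3:2] = 01: even-indexed bytes of \a A are lower bounds, odd-indexed bytes upper bounds.

    /* Hypothetical helper: index of the first ASCII digit in a 16-byte
       block; PCMPISTRI returns 16 when nothing matches in byte mode. */
    static int first_digit_index(const char block[16])
    {
        static const char range[16] = "09";   /* lower bound, upper bound */
        __m128i a = _mm_loadu_si128((const __m128i *)range);
        __m128i b = _mm_loadu_si128((const __m128i *)block);
        return _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |
                                  _SIDD_LEAST_SIGNIFICANT);
    }
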
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns a 128-bit integer vector representing the result
+/// mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRM / PCMPESTRM </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
+/// bytes. \n
+/// 0: The result is zero-extended to 16 bytes. \n
+/// 1: The result is expanded to 16 bytes (this expansion is performed by
+/// repeating each bit 8 or 16 times). \n
+/// \returns Returns a 128-bit integer vector representing the result mask of
+/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns an integer representing the result index of the
+/// comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the index of the lowest set bit or the
+/// highest set bit is returned. \n
+/// 0: The index of the least significant set bit. \n
+/// 1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
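
[Editor's illustration] The explicit-length forms ignore NUL terminators: the lengths \a LA and \a LB alone bound the strings, so embedded zero bytes are ordinary characters. A hedged sketch (hypothetical helper); only the mode immediate must be a compile-time constant, the lengths may be runtime values.

    /* Hypothetical helper: position of the first byte of `hay` (length lb,
       possibly containing NULs) equal to any of the first la bytes of
       `needles`; returns 16 when there is no match in this block. */
    static int find_any_bounded(__m128i needles, int la, __m128i hay, int lb)
    {
        return _mm_cmpestri(needles, la, hay, lb,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                            _SIDD_LEAST_SIGNIFICANT);
    }
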
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+/// string in \a B is the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
+/// 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is non-zero; otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
+/// the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+/// maximum; otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
+/// the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+/// maximum; otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
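
[Editor's illustration] The flag-reading forms above all compile to the same PCMPISTRI instruction and differ only in which EFLAGS bit they return (CF for _mm_cmpistrc, ZF for _mm_cmpistrz, and so on), so combining them in one loop is cheap. A hedged strpbrk-style sketch; the helper name is hypothetical, and the 16-byte read past the terminator must be made safe in real code (for example, by keeping loads within a page).

    #define FIND_MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
                       _SIDD_LEAST_SIGNIFICANT)

    static const char *find_any_in_string(const char *s, __m128i set)
    {
        for (;;) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)s);
            if (_mm_cmpistrc(set, chunk, FIND_MODE))      /* any match (CF) */
                return s + _mm_cmpistri(set, chunk, FIND_MODE);
            if (_mm_cmpistrz(set, chunk, FIND_MODE))      /* hit NUL (ZF) */
                return 0;
            s += 16;
        }
    }
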
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+/// string in \a B is the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
+/// returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the resulting mask is non-zero; otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
+/// the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+/// maximum; otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
+/// the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+/// maximum; otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
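
[Editor's illustration] The explicit-length flag readers support bounded scans where no terminator exists. A hedged sketch (hypothetical helper); the hardware caps a length above 16 at 16, so passing the remaining count directly is fine, but the trailing partial chunk is still loaded in full, so the buffer is assumed readable up to a 16-byte boundary.

    #define SCAN_MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)

    static int contains_any(const char *p, int n, __m128i needles, int ln)
    {
        while (n > 0) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)p);
            if (_mm_cmpestrc(needles, ln, chunk, n, SCAN_MODE))
                return 1;   /* non-zero result mask in this block */
            p += 16;
            n -= 16;
        }
        return 0;
    }
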
/* SSE4.2 Compare Packed Data -- Greater Than. */
+/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// integer vectors to determine if the values in the first operand are
+/// greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> VPCMPGTQ / PCMPGTQ </i> </c>
+/// instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
@@ -472,18 +2427,60 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
}
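
[Editor's illustration] Since SSE has no 64-bit max instruction, the all-ones/all-zeros lanes of this comparison compose naturally with SSE2 logic ops. A minimal sketch (hypothetical helper):

    /* Hypothetical helper: per-lane signed 64-bit maximum; each lane of
       `gt` is all-ones where a > b (selecting a) and all-zeros otherwise
       (selecting b). */
    static __m128i max_epi64_sketch(__m128i a, __m128i b)
    {
        __m128i gt = _mm_cmpgt_epi64(a, b);
        return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
    }
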
/* SSE4.2 Accumulate CRC32. */
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> CRC32B </i> </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
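
[Editor's illustration] These intrinsics accumulate, so checksumming a buffer is a simple fold. A hedged sketch (hypothetical helper, byte-at-a-time for clarity; the wider _mm_crc32_u16/_u32/_u64 variants consume more input per step):

    #include <stddef.h>   /* size_t; <x86intrin.h> and -msse4.2 as above */

    /* The caller supplies the initial value (commonly 0xFFFFFFFF for
       standard CRC-32C) and applies any final inversion itself. */
    static unsigned int crc32c_update(unsigned int crc,
                                      const unsigned char *p, size_t n)
    {
        while (n--)
            crc = _mm_crc32_u8(crc, *p++);
        return crc;
    }
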
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> CRC32W </i> </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
+/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
+/// the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> CRC32L </i> </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
@@ -491,6 +2488,20 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D)
}
#ifdef __x86_64__
+/// \brief Adds the first unsigned 64-bit integer operand to the CRC-32C
+/// checksum of the second unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> <i> CRC32Q </i> </c> instruction.
+///
+/// \param __C
+/// An unsigned 64-bit integer operand to add to the CRC-32C checksum of
+/// \a __D.
+/// \param __D
+/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
diff --git a/lib/Headers/stdarg.h b/lib/Headers/stdarg.h
index a57e18364871..101426fff151 100644
--- a/lib/Headers/stdarg.h
+++ b/lib/Headers/stdarg.h
@@ -43,10 +43,9 @@ typedef __builtin_va_list va_list;
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif
-/* Hack required to make standard headers work, at least on Ubuntu */
#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST 1
-#endif
typedef __builtin_va_list __gnuc_va_list;
+#endif
#endif /* __STDARG_H */
diff --git a/lib/Headers/tgmath.h b/lib/Headers/tgmath.h
index 318e1185feee..34e26dcc05ec 100644
--- a/lib/Headers/tgmath.h
+++ b/lib/Headers/tgmath.h
@@ -22,12 +22,21 @@
*
\*===----------------------------------------------------------------------===*/
-#ifndef __TGMATH_H
-#define __TGMATH_H
+#ifndef __CLANG_TGMATH_H
+#define __CLANG_TGMATH_H
/* C99 7.22 Type-generic math <tgmath.h>. */
#include <math.h>
+/*
+ * Allow additional definitions and implementation-defined values on Apple
+ * platforms. This is done after #include <math.h> to avoid dependency-cycle
+ * conflicts between libcxx and Darwin in C++ modules builds.
+ */
+#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>)
+# include_next <tgmath.h>
+#else
+
/* C++ handles type genericity with overloading in math.h. */
#ifndef __cplusplus
#include <complex.h>
@@ -1371,4 +1380,5 @@ static long double
#undef _TG_ATTRS
#endif /* __cplusplus */
-#endif /* __TGMATH_H */
+#endif /* __has_include_next */
+#endif /* __CLANG_TGMATH_H */
diff --git a/lib/Headers/x86intrin.h b/lib/Headers/x86intrin.h
index 81a404f55d01..2003029cb5a8 100644
--- a/lib/Headers/x86intrin.h
+++ b/lib/Headers/x86intrin.h
@@ -80,6 +80,10 @@
#include <mwaitxintrin.h>
#endif
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
+#include <clzerointrin.h>
+#endif
+
/* FIXME: LWP */
#endif /* __X86INTRIN_H */
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index dc31b85cfd7c..bb8e29cd9052 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -2067,7 +2067,7 @@ _mm_storer_ps(float *__p, __m128 __a)
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-/// be generated.
+/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif
@@ -2435,17 +2435,17 @@ extern "C" {
/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-/// </li>
+/// </li>
/// <li>
/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
/// </li>
-/// <li>
+/// <li>
/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
/// </li>
-/// <li>
+/// <li>
/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_GET_DENORMALS_ZERO_MODE().
@@ -2468,11 +2468,11 @@ extern "C" {
unsigned int _mm_getcsr(void);
/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
-///
+///
/// There are several groups of macros associated with this intrinsic,
/// including:
/// <ul>
-/// <li>
+/// <li>
/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
@@ -2517,7 +2517,7 @@ unsigned int _mm_getcsr(void);
///
/// \param __i
/// A 32-bit unsigned integer value to be written to the MXCSR register.
-void _mm_setcsr(unsigned int);
+void _mm_setcsr(unsigned int __i);
#if defined(__cplusplus)
} // extern "C"
diff --git a/lib/Headers/xopintrin.h b/lib/Headers/xopintrin.h
index bdf0cec32645..4a34f770d58d 100644
--- a/lib/Headers/xopintrin.h
+++ b/lib/Headers/xopintrin.h
@@ -198,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
{
- return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
+ return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
}
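
[Editor's illustration] The new open-coded definition makes the select semantics explicit: each result bit takes \a __A where the corresponding bit of \a __C is set, and \a __B where it is clear. An equivalence sketch with SSE2 logic ops (hypothetical helper), usable when XOP is absent:

    /* Hypothetical helper: same bitwise select without the XOP intrinsic;
       _mm_andnot_si128(c, b) computes (~c) & b. */
    static __m128i cmov_fallback(__m128i a, __m128i b, __m128i c)
    {
        return _mm_or_si128(_mm_and_si128(a, c), _mm_andnot_si128(c, b));
    }
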
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
{
- return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
+ return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
}
static __inline__ __m128i __DEFAULT_FN_ATTRS