Diffstat (limited to 'lib/Headers')
-rw-r--r--  lib/Headers/CMakeLists.txt  18
-rw-r--r--  lib/Headers/__clang_cuda_builtin_vars.h (renamed from lib/Headers/cuda_builtin_vars.h)  0
-rw-r--r--  lib/Headers/__clang_cuda_cmath.h  341
-rw-r--r--  lib/Headers/__clang_cuda_complex_builtins.h  203
-rw-r--r--  lib/Headers/__clang_cuda_math_forward_declares.h  25
-rw-r--r--  lib/Headers/__clang_cuda_runtime_wrapper.h  46
-rw-r--r--  lib/Headers/__wmmintrin_aes.h  12
-rw-r--r--  lib/Headers/__wmmintrin_pclmul.h  15
-rw-r--r--  lib/Headers/altivec.h  2274
-rw-r--r--  lib/Headers/ammintrin.h  77
-rw-r--r--  lib/Headers/armintr.h  45
-rw-r--r--  lib/Headers/avx512bwintrin.h  428
-rw-r--r--  lib/Headers/avx512dqintrin.h  437
-rw-r--r--  lib/Headers/avx512fintrin.h  2346
-rw-r--r--  lib/Headers/avx512vlbwintrin.h  1822
-rw-r--r--  lib/Headers/avx512vldqintrin.h  439
-rw-r--r--  lib/Headers/avx512vlintrin.h  2470
-rw-r--r--  lib/Headers/avxintrin.h  3096
-rw-r--r--  lib/Headers/bmiintrin.h  64
-rw-r--r--  lib/Headers/cuda_wrappers/algorithm  96
-rw-r--r--  lib/Headers/cuda_wrappers/complex  82
-rw-r--r--  lib/Headers/cuda_wrappers/new  47
-rw-r--r--  lib/Headers/emmintrin.h  2546
-rw-r--r--  lib/Headers/f16cintrin.h  28
-rw-r--r--  lib/Headers/float.h  7
-rw-r--r--  lib/Headers/fxsrintrin.h  62
-rw-r--r--  lib/Headers/ia32intrin.h  6
-rw-r--r--  lib/Headers/immintrin.h  35
-rw-r--r--  lib/Headers/intrin.h  682
-rw-r--r--  lib/Headers/lzcntintrin.h  50
-rw-r--r--  lib/Headers/mmintrin.h  174
-rw-r--r--  lib/Headers/module.modulemap  2
-rw-r--r--  lib/Headers/opencl-c.h  655
-rw-r--r--  lib/Headers/pmmintrin.h  53
-rw-r--r--  lib/Headers/popcntintrin.h  8
-rw-r--r--  lib/Headers/stdatomic.h  10
-rw-r--r--  lib/Headers/tmmintrin.h  42
-rw-r--r--  lib/Headers/xmmintrin.h  579
38 files changed, 13669 insertions, 5653 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index fa2d2107781b..efc4dd0971b6 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -3,6 +3,7 @@ set(files
altivec.h
ammintrin.h
arm_acle.h
+ armintr.h
avx2intrin.h
avx512bwintrin.h
avx512cdintrin.h
@@ -21,12 +22,13 @@ set(files
avxintrin.h
bmi2intrin.h
bmiintrin.h
+ __clang_cuda_builtin_vars.h
__clang_cuda_cmath.h
+ __clang_cuda_complex_builtins.h
__clang_cuda_intrinsics.h
__clang_cuda_math_forward_declares.h
__clang_cuda_runtime_wrapper.h
cpuid.h
- cuda_builtin_vars.h
clflushoptintrin.h
emmintrin.h
f16cintrin.h
@@ -88,6 +90,12 @@ set(files
xtestintrin.h
)
+set(cuda_wrapper_files
+ cuda_wrappers/algorithm
+ cuda_wrappers/complex
+ cuda_wrappers/new
+)
+
set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
# Generate arm_neon.h
@@ -95,7 +103,7 @@ clang_tablegen(arm_neon.h -gen-arm-neon
SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
set(out_files)
-foreach( f ${files} )
+foreach( f ${files} ${cuda_wrapper_files} )
set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
set( dst ${output_dir}/${f} )
add_custom_command(OUTPUT ${dst}
@@ -120,6 +128,12 @@ install(
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+install(
+ FILES ${cuda_wrapper_files}
+ COMPONENT clang-headers
+ PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+ DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include/cuda_wrappers)
+
if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDE's.
add_custom_target(install-clang-headers
DEPENDS clang-headers
diff --git a/lib/Headers/cuda_builtin_vars.h b/lib/Headers/__clang_cuda_builtin_vars.h
index 6f5eb9c78d85..6f5eb9c78d85 100644
--- a/lib/Headers/cuda_builtin_vars.h
+++ b/lib/Headers/__clang_cuda_builtin_vars.h
diff --git a/lib/Headers/__clang_cuda_cmath.h b/lib/Headers/__clang_cuda_cmath.h
index ae7ff2f8d306..0eaa08b30cab 100644
--- a/lib/Headers/__clang_cuda_cmath.h
+++ b/lib/Headers/__clang_cuda_cmath.h
@@ -26,13 +26,15 @@
#error "This file is for CUDA compilation only."
#endif
+#include <limits>
+
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
-// the implementations have been pulled in.
+// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
@@ -73,7 +75,10 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS. __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
@@ -120,12 +125,15 @@ __DEVICE__ float ldexp(float __arg, int __exp) {
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-__DEVICE__ float nexttoward(float __from, float __to) {
+__DEVICE__ float nexttoward(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ double nexttoward(double __from, double __to) {
return __builtin_nexttoward(__from, __to);
}
+__DEVICE__ float nexttowardf(float __from, double __to) {
+ return __builtin_nexttowardf(__from, __to);
+}
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
@@ -136,13 +144,338 @@ __DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h. We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
+// only sin(float) and sin(double), which means that e.g. sin(0) is
+// ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+// std. These are defined in the CUDA headers in the global namespace,
+// independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+ typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
+ template <typename __T> \
+ __DEVICE__ \
+ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
+ __retty>::type \
+ __fn(__T __x) { \
+ return ::__fn((double)__x); \
+ }
+
+// Defines an overload of __fn that accepts two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns __retty.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
+ template <typename __T1, typename __T2> \
+ __DEVICE__ typename __clang_cuda_enable_if< \
+ std::numeric_limits<__T1>::is_specialized && \
+ std::numeric_limits<__T2>::is_specialized, \
+ __retty>::type \
+ __fn(__T1 __x, __T2 __y) { \
+ return __fn((double)__x, (double)__y); \
+ }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+ std::numeric_limits<__T1>::is_specialized &&
+ std::numeric_limits<__T2>::is_specialized &&
+ std::numeric_limits<__T3>::is_specialized,
+ double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+ return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+frexp(__T __x, int *__exp) {
+ return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+ldexp(__T __x, int __exp) {
+ return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+nexttoward(__T __from, double __to) {
+ return std::nexttoward((double)__from, __to);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+ std::numeric_limits<__T1>::is_specialized &&
+ std::numeric_limits<__T2>::is_specialized,
+ double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+ return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+scalbln(__T __x, long __exp) {
+ return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+scalbn(__T __x, int __exp) {
+ return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::nexttowardf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
#undef __DEVICE__
#endif
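(An illustrative aside, not part of the diff: a minimal host-side sketch of
the dispatch pattern the __CUDA_CLANG_FN_INTEGER_OVERLOAD_* macros above
generate. The name my_enable_if and the stand-in sin bodies are hypothetical.)

#include <limits>

template <bool __B, class __T = void> struct my_enable_if {};
template <class __T> struct my_enable_if<true, __T> { typedef __T type; };

inline double sin(double __x) { return __x; } // stand-ins for the real
inline float sin(float __x) { return __x; }   // float/double overloads

// With only the two overloads above, sin(0) is ambiguous: int converts
// equally well to float and to double. The gated template accepts integral
// arguments, casts to double, and forwards.
template <typename __T>
typename my_enable_if<std::numeric_limits<__T>::is_integer, double>::type
sin(__T __x) {
  return sin((double)__x);
}

double __test() { return sin(0); } // now unambiguous: calls the template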
diff --git a/lib/Headers/__clang_cuda_complex_builtins.h b/lib/Headers/__clang_cuda_complex_builtins.h
new file mode 100644
index 000000000000..beef7deff87f
--- /dev/null
+++ b/lib/Headers/__clang_cuda_complex_builtins.h
@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations. (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+ double __c, double __d) {
+ double __ac = __a * __c;
+ double __bd = __b * __d;
+ double __ad = __a * __d;
+ double __bc = __b * __c;
+ double _Complex z;
+ __real__(z) = __ac - __bd;
+ __imag__(z) = __ad + __bc;
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ int __recalc = 0;
+ if (std::isinf(__a) || std::isinf(__b)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (std::isinf(__c) || std::isinf(__d)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ __recalc = 1;
+ }
+ if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+ std::isinf(__ad) || std::isinf(__bc))) {
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (__recalc) {
+ // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+ // a device overload (and isn't constexpr before C++11, naturally).
+ __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+ float __c, float __d) {
+ float __ac = __a * __c;
+ float __bd = __b * __d;
+ float __ad = __a * __d;
+ float __bc = __b * __c;
+ float _Complex z;
+ __real__(z) = __ac - __bd;
+ __imag__(z) = __ad + __bc;
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ int __recalc = 0;
+ if (std::isinf(__a) || std::isinf(__b)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (std::isinf(__c) || std::isinf(__d)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ __recalc = 1;
+ }
+ if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+ std::isinf(__ad) || std::isinf(__bc))) {
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (__recalc) {
+ __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+ double __c, double __d) {
+ int __ilogbw = 0;
+ // Can't use std::max, because that's defined in <algorithm>, and we don't
+ // want to pull that in for every compile. The CUDA headers define
+ // ::max(float, float) and ::max(double, double), which is sufficient for us.
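+  // Scale the divisor so its larger component has magnitude near 1: this
+  // keeps __c * __c + __d * __d below from overflowing or underflowing, and
+  // the quotient is rescaled by the same power of two afterwards.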
+ double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+ if (std::isfinite(__logbw)) {
+ __ilogbw = (int)__logbw;
+ __c = std::scalbn(__c, -__ilogbw);
+ __d = std::scalbn(__d, -__ilogbw);
+ }
+ double __denom = __c * __c + __d * __d;
+ double _Complex z;
+ __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+ __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+ __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+ __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+ } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+ std::isfinite(__d)) {
+ __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+ __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+ } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+ std::isfinite(__b)) {
+ __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+ __real__(z) = 0.0 * (__a * __c + __b * __d);
+ __imag__(z) = 0.0 * (__b * __c - __a * __d);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+ float __c, float __d) {
+ int __ilogbw = 0;
+ float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+ if (std::isfinite(__logbw)) {
+ __ilogbw = (int)__logbw;
+ __c = std::scalbn(__c, -__ilogbw);
+ __d = std::scalbn(__d, -__ilogbw);
+ }
+ float __denom = __c * __c + __d * __d;
+ float _Complex z;
+ __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+ __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+ __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+ __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+ } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+ std::isfinite(__d)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+ } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+ std::isfinite(__b)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ __real__(z) = 0 * (__a * __c + __b * __d);
+ __imag__(z) = 0 * (__b * __c - __a * __d);
+ }
+ }
+ return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
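(An illustrative aside, not part of the diff: why these symbols must exist.
For a full C99 _Complex multiply or divide, Clang emits calls to __muldc3 /
__divdc3 rather than inline code, unless fast-math or CX_LIMITED_RANGE is in
effect. A host-side sketch, with mul as a hypothetical name; Clang accepts
_Complex in C++ as an extension:)

double _Complex mul(double _Complex __x, double _Complex __y) {
  // Lowered to roughly:
  //   __muldc3(__real__ __x, __imag__ __x, __real__ __y, __imag__ __y)
  return __x * __y;
}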
diff --git a/lib/Headers/__clang_cuda_math_forward_declares.h b/lib/Headers/__clang_cuda_math_forward_declares.h
index 3f2834d95000..49c805151d65 100644
--- a/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -140,6 +140,7 @@ __DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
@@ -149,7 +150,8 @@ __DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double nexttoward(double, double);
-__DEVICE__ float nexttoward(float, float);
+__DEVICE__ float nexttoward(float, double);
+__DEVICE__ float nexttowardf(float, double);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
@@ -183,7 +185,19 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
using ::abs;
using ::acos;
using ::acosh;
@@ -235,6 +249,7 @@ using ::log2;
using ::logb;
using ::lrint;
using ::lround;
+using ::llround;
using ::modf;
using ::nan;
using ::nanf;
@@ -256,7 +271,15 @@ using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
} // namespace std
+#endif
#pragma pop_macro("__DEVICE__")
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index 6445f9b76b8f..205e15b40b5d 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
#error "Unsupported CUDA version!"
#endif
@@ -72,9 +72,9 @@
#define __CUDA_ARCH__ 350
#endif
-#include "cuda_builtin_vars.h"
+#include "__clang_cuda_builtin_vars.h"
-// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
@@ -113,6 +113,7 @@
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
+#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
@@ -120,6 +121,15 @@
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
+// math_functions.hpp expects this host function to be defined on MacOS, but
+// it ends up not being there because of the games we play here. Just define
+// it ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+ return std::signbit(x);
+}
+#endif
+
// We need decls for functions in CUDA's libdevice with __device__
// attribute only. Alas they come either as __host__ __device__ or
// with no attributes at all. To work around that, define __CUDA_RTC__
@@ -135,6 +145,21 @@
// the headers we're about to include.
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+// CUDA 8.0.41 relies on the values of __USE_FAST_MATH__ and __CUDA_PREC_DIV;
+// previous versions only checked whether or not they were defined. The
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it here to
+// detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
@@ -151,7 +176,7 @@
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
-#define __USE_FAST_MATH__
+#define __USE_FAST_MATH__ 1
#endif
#include "math_functions.hpp"
#pragma pop_macro("__USE_FAST_MATH__")
@@ -267,8 +292,8 @@ __device__ static inline void *malloc(size_t __size) {
}
} // namespace std
-// Out-of-line implementations from cuda_builtin_vars.h. These need to come
-// after we've pulled in the definition of uint3 and dim3.
+// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
+// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
uint3 ret;
@@ -296,13 +321,14 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
-// incompatible with the types we give in cuda_builtin_vars.h. As as hack,
-// force-include the header (nvcc doesn't include it by default) but redefine
-// dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are only
-// used here for the redeclarations of blockDim and threadIdx.)
+// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
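(An illustrative aside, not part of the diff: the save/define/restore idiom
used above for __USE_FAST_MATH__, dim3, and uint3, shown with a hypothetical
macro FOO:)

#pragma push_macro("FOO")  // save the current definition, if any
#define FOO 1              // the value the included headers expect
// ... include headers that test FOO ...
#pragma pop_macro("FOO")   // restore whatever the user had defined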
diff --git a/lib/Headers/__wmmintrin_aes.h b/lib/Headers/__wmmintrin_aes.h
index 211518eb2884..3a2ee1b2ef2e 100644
--- a/lib/Headers/__wmmintrin_aes.h
+++ b/lib/Headers/__wmmintrin_aes.h
@@ -35,7 +35,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESENC instruction.
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -55,7 +55,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESENCLAST instruction.
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -75,7 +75,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESDEC instruction.
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -95,7 +95,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESDECLAST instruction.
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -114,7 +114,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESIMC instruction.
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
@@ -136,7 +136,7 @@ _mm_aesimc_si128(__m128i __V)
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
-/// This intrinsic corresponds to the \c AESKEYGENASSIST instruction.
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
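(An illustrative aside, not part of the diff: one use of the intrinsics
documented above. aes_round is a hypothetical helper; build with -maes.)

#include <wmmintrin.h>
static inline __m128i aes_round(__m128i __state, __m128i __round_key) {
  // One AES round: ShiftRows, SubBytes, MixColumns, then XOR the round key.
  return _mm_aesenc_si128(__state, __round_key);
}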
diff --git a/lib/Headers/__wmmintrin_pclmul.h b/lib/Headers/__wmmintrin_pclmul.h
index d4e073f40688..e9c6a9f6d415 100644
--- a/lib/Headers/__wmmintrin_pclmul.h
+++ b/lib/Headers/__wmmintrin_pclmul.h
@@ -34,7 +34,7 @@
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPCLMULQDQ instruction.
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param __X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
@@ -42,13 +42,12 @@
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __I
/// An immediate value specifying which 64-bit values to select from the
-/// operands.
-/// Bit 0 is used to select a value from operand __X,
-/// and bit 4 is used to select a value from operand __Y:
-/// Bit[0]=0 indicates that bits[63:0] of operand __X are used.
-/// Bit[0]=1 indicates that bits[127:64] of operand __X are used.
-/// Bit[4]=0 indicates that bits[63:0] of operand __Y are used.
-/// Bit[4]=1 indicates that bits[127:64] of operand __Y are used.
+/// operands. Bit 0 is used to select a value from operand \a __X, and bit
+/// 4 is used to select a value from operand \a __Y: \n
+/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(__X, __Y, __I) \
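(An illustrative aside, not part of the diff: how the immediate picks the
64-bit halves. clmul_hi_lo is a hypothetical helper; build with -mpclmul.)

#include <wmmintrin.h>
static inline __m128i clmul_hi_lo(__m128i __x, __m128i __y) {
  // 0x01: bit 0 set selects bits[127:64] of __x; bit 4 clear selects
  // bits[63:0] of __y.
  return _mm_clmulepi64_si128(__x, __y, 0x01);
}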
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index 74a1914ce83b..d1d1d8026325 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -34,8 +34,31 @@
#define __CR6_LT 2
#define __CR6_LT_REV 3
+/* Constants for vec_test_data_class */
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+ __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_ZERO_N (1<<2)
+#define __VEC_CLASS_FP_ZERO_P (1<<3)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | \
+ __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_INFINITY_N (1<<4)
+#define __VEC_CLASS_FP_INFINITY_P (1<<5)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \
+ __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \
+ __VEC_CLASS_FP_SUBNORMAL | \
+ __VEC_CLASS_FP_ZERO | \
+ __VEC_CLASS_FP_INFINITY)
+
#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+#ifdef __POWER9_VECTOR__
+#include <stddef.h>
+#endif
+
static __inline__ vector signed char __ATTRS_o_ai vec_perm(
vector signed char __a, vector signed char __b, vector unsigned char __c);
@@ -134,7 +157,7 @@ static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) {
#endif
}
-#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+#ifdef __VSX__
static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
return __builtin_vsx_xvabsdp(__a);
}
@@ -163,6 +186,26 @@ vec_abss(vector signed int __a) {
__a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
}
+/* vec_absd */
+#if defined(__POWER9_VECTOR__)
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_absd(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vabsdub(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_absd(vector unsigned short __a, vector unsigned short __b) {
+ return __builtin_altivec_vabsduh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_absd(vector unsigned int __a, vector unsigned int __b) {
+ return __builtin_altivec_vabsduw(__a, __b);
+}
+
+#endif /* End __POWER9_VECTOR__ */
+
/* vec_add */
static __inline__ vector signed char __ATTRS_o_ai
@@ -305,6 +348,22 @@ vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
}
#endif
+static __inline__ vector signed int __ATTRS_o_ai
+vec_adde(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ vector signed int __mask = {1, 1, 1, 1};
+ vector signed int __carry = __c & __mask;
+ return vec_add(vec_add(__a, __b), __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adde(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ vector unsigned int __mask = {1, 1, 1, 1};
+ vector unsigned int __carry = __c & __mask;
+ return vec_add(vec_add(__a, __b), __carry);
+}
+
/* vec_addec */
#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
@@ -319,6 +378,50 @@ vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
vector unsigned __int128 __c) {
return __builtin_altivec_vaddecuq(__a, __b, __c);
}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addec(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+
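+  // Model each lane as a 64-bit sum of the zero-extended operands plus the
+  // low bit of the carry-in; bit 32 of that sum is the lane's carry-out.
+  // E.g. a lane with __a = 0xFFFFFFFF, __b = 0, __c = 1 sums to 0x100000000,
+  // so the result lane is 1.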
+ signed int __result[4];
+ for (int i = 0; i < 4; i++) {
+ unsigned int __tempa = (unsigned int) __a[i];
+ unsigned int __tempb = (unsigned int) __b[i];
+ unsigned int __tempc = (unsigned int) __c[i];
+ __tempc = __tempc & 0x00000001;
+ unsigned long long __longa = (unsigned long long) __tempa;
+ unsigned long long __longb = (unsigned long long) __tempb;
+ unsigned long long __longc = (unsigned long long) __tempc;
+ unsigned long long __sum = __longa + __longb + __longc;
+ unsigned long long __res = (__sum >> 32) & 0x01;
+ unsigned long long __tempres = (unsigned int) __res;
+ __result[i] = (signed int) __tempres;
+ }
+
+ vector signed int ret = { __result[0], __result[1], __result[2], __result[3] };
+ return ret;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addec(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+
+ unsigned int __result[4];
+ for (int i = 0; i < 4; i++) {
+ unsigned int __tempc = __c[i] & 1;
+ unsigned long long __longa = (unsigned long long) __a[i];
+ unsigned long long __longb = (unsigned long long) __b[i];
+ unsigned long long __longc = (unsigned long long) __tempc;
+ unsigned long long __sum = __longa + __longb + __longc;
+ unsigned long long __res = (__sum >> 32) & 0x01;
+ unsigned long long __tempres = (unsigned int) __res;
+ __result[i] = (signed int) __tempres;
+ }
+
+ vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] };
+ return ret;
+}
+
#endif
/* vec_vaddubm */
@@ -1544,6 +1647,12 @@ vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
(vector char)__b);
}
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+ return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+ (vector char)__b);
+}
+
static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
vector short __b) {
return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
@@ -1555,6 +1664,12 @@ vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
(vector short)__b);
}
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+ return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+ (vector short)__b);
+}
+
static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
vector int __b) {
return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
@@ -1566,6 +1681,12 @@ vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
(vector int)__b);
}
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a,
+ vector bool int __b) {
+ return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+ (vector int)__b);
+}
+
#ifdef __POWER8_VECTOR__
static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpeq(vector signed long long __a, vector signed long long __b) {
@@ -1577,6 +1698,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
return (vector bool long long)__builtin_altivec_vcmpequd(
(vector long long)__a, (vector long long)__b);
}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+ return (vector bool long long)__builtin_altivec_vcmpequd(
+ (vector long long)__a, (vector long long)__b);
+}
+
#endif
static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
@@ -1595,6 +1723,199 @@ vec_cmpeq(vector double __a, vector double __b) {
}
#endif
+#ifdef __POWER9_VECTOR__
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+/* vec_cmpnez */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector signed char __a, vector signed char __b) {
+ return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector unsigned char __a, vector unsigned char __b) {
+ return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector signed short __a, vector signed short __b) {
+ return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector unsigned short __a, vector unsigned short __b) {
+ return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector signed int __a, vector signed int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector unsigned int __a, vector unsigned int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+ (vector int)__b);
+}
+
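+// vec_cntlz_lsbb / vec_cnttz_lsbb count leading / trailing bytes, in
+// element order, whose least-significant bit is clear. Element order on
+// little-endian targets is the reverse of register bit order, hence the
+// vctzlsbb/vclzlsbb swap below.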
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vctzlsbb(__a);
+#else
+ return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vctzlsbb(__a);
+#else
+ return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vclzlsbb(__a);
+#else
+ return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vclzlsbb(__a);
+#else
+ return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned int __a) {
+ return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector signed int __a) {
+ return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned __int128 __a) {
+ return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector signed __int128 __a) {
+ return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned long long __a) {
+ return __builtin_altivec_vprtybd(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector signed long long __a) {
+ return __builtin_altivec_vprtybd(__a);
+}
+
+#endif
+
/* vec_cmpgt */
static __inline__ vector bool char __ATTRS_o_ai
@@ -1882,6 +2203,41 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
return vec_cmpgt(__b, __a);
}
+/* vec_popcnt */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_popcnt(vector signed char __a) {
+ return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_popcnt(vector unsigned char __a) {
+ return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_popcnt(vector signed short __a) {
+ return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_popcnt(vector unsigned short __a) {
+ return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_popcnt(vector signed int __a) {
+ return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_popcnt(vector unsigned int __a) {
+ return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_popcnt(vector signed long long __a) {
+ return __builtin_altivec_vpopcntd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_popcnt(vector unsigned long long __a) {
+ return __builtin_altivec_vpopcntd(__a);
+}
+
/* vec_cntlz */
static __inline__ vector signed char __ATTRS_o_ai
@@ -1918,6 +2274,603 @@ vec_cntlz(vector unsigned long long __a) {
}
#endif
+#ifdef __POWER9_VECTOR__
+
+/* vec_cnttz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cnttz(vector signed char __a) {
+ return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cnttz(vector unsigned char __a) {
+ return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cnttz(vector signed short __a) {
+ return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cnttz(vector unsigned short __a) {
+ return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cnttz(vector signed int __a) {
+ return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cnttz(vector unsigned int __a) {
+ return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cnttz(vector signed long long __a) {
+ return __builtin_altivec_vctzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttz(vector unsigned long long __a) {
+ return __builtin_altivec_vctzd(__a);
+}
+
+/* vec_first_match_index */
+
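+// The comparison mask is scanned as two 64-bit halves: the bit index of the
+// first matching lane, shifted right by log2 of the element width in bits
+// (3 for bytes, 4 for halfwords, 5 for words), is the element index.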
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed char __a, vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned char __a, vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed short __a, vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned short __a, vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned int __a, vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_match_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) {
+  /* OR the equality comparison of the two vectors with comparisons of each
+     input against that result: a lane of __tmp2 is set when the elements are
+     equal, or when either element is zero, since a zero element compares
+     equal to the all-zeros lanes the equality mask has wherever the inputs
+     differ.
+  */
+ vector bool char __tmp1 = vec_cmpeq(__a, __b);
+ vector bool char __tmp2 = __tmp1 |
+ vec_cmpeq((vector signed char)__tmp1, __a) |
+ vec_cmpeq((vector signed char)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned char __a,
+ vector unsigned char __b) {
+ vector bool char __tmp1 = vec_cmpeq(__a, __b);
+ vector bool char __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned char)__tmp1, __a) |
+ vec_cmpeq((vector unsigned char)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) {
+ vector bool short __tmp1 = vec_cmpeq(__a, __b);
+ vector bool short __tmp2 = __tmp1 |
+ vec_cmpeq((vector signed short)__tmp1, __a) |
+ vec_cmpeq((vector signed short)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned short __a,
+ vector unsigned short __b) {
+ vector bool short __tmp1 = vec_cmpeq(__a, __b);
+ vector bool short __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned short)__tmp1, __a) |
+ vec_cmpeq((vector unsigned short)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) {
+ vector bool int __tmp1 = vec_cmpeq(__a, __b);
+ vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) |
+ vec_cmpeq((vector signed int)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) {
+ vector bool int __tmp1 = vec_cmpeq(__a, __b);
+ vector bool int __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned int)__tmp1, __a) |
+ vec_cmpeq((vector unsigned int)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed char __a, vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed short __a, vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed char __a,
+ vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned char __a,
+ vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed short __a,
+ vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned short __a,
+ vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned int __a,
+ vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
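The mismatch family below is the complement: vec_first_mismatch_index reports
the first differing element, and the _or_eos form (built on vec_cmpnez) also
stops at a zero in either input, which is exactly the loop body of a
strncmp-style comparison. A sketch with illustrative names:

    // 16 means the blocks are identical (and, for the _or_eos form,
    // contain no NUL terminator).
    vector unsigned char va = vec_xl(0, (unsigned char *)buf1);
    vector unsigned char vb = vec_xl(0, (unsigned char *)buf2);
    unsigned diff = vec_first_mismatch_index(va, vb);
    unsigned diff_or_end = vec_first_mismatch_or_eos_index(va, vb);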
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector double __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector float __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp(__a, __b);
+}
+
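vec_insert_exp builds each result element from the sign and significand bits
of __a and the biased exponent supplied in __b, mirroring the VSX
xviexpdp/xviexpsp instructions. One classic use is range reduction; a sketch,
assuming IEEE double's exponent bias of 1023:

    // Force every element of v into [1.0, 2.0) in magnitude by overwriting
    // its biased exponent with the bias, keeping sign and significand.
    vector unsigned long long bias = {1023, 1023};
    vector double reduced = vec_insert_exp(v, bias);  // v: vector double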
+#if defined(__powerpc64__)
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+ size_t __b) {
+ return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len(unsigned char *__a, size_t __b) {
+ return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+ size_t __b) {
+ return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_len(unsigned short *__a, size_t __b) {
+ return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+ size_t __b) {
+ return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+ size_t __b) {
+ return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+ return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_len(signed __int128 *__a, size_t __b) {
+ return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_len(unsigned __int128 *__a, size_t __b) {
+ return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_len(signed long long *__a, size_t __b) {
+ return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_len(unsigned long long *__a, size_t __b) {
+ return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+ size_t __b) {
+ return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len_r(unsigned char *__a, size_t __b) {
+ vector unsigned char __res =
+ (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned char __mask =
+ (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
+ __res = (vector unsigned char)__builtin_altivec_vperm_4si(
+ (vector int)__res, (vector int)__res, __mask);
+#endif
+ return __res;
+}
+
+// vec_xst_len
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a,
+ unsigned char *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a,
+ signed char *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a,
+ signed short *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a,
+ unsigned short *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a,
+ signed int *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a,
+ unsigned int *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a,
+ signed __int128 *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a,
+ unsigned __int128 *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed long long __a,
+ signed long long *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a,
+ unsigned long long *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
+ unsigned char *__b,
+ size_t __c) {
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned char __mask =
+ (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
+ vector unsigned char __res =
+ __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
+ return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
+#else
+ return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
+#endif
+}
+#endif
+#endif
+
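These length-restricted accesses wrap the POWER9 lxvl/stxvl instructions,
which expect the byte count in the top byte of the length register, hence the
<< 56 throughout. They let a loop's tail be handled without reading or writing
past the buffer; a sketch with illustrative names:

    // Copy the final n (0 < n < 16) bytes of src to dst without touching
    // memory beyond either buffer.
    vector unsigned char tail = vec_xl_len(src, n);
    vec_xst_len(tail, dst, n);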
/* vec_cpsgn */
#ifdef __VSX__
@@ -2016,20 +2969,284 @@ vec_vctuxs(vector float __a, int __b) {
return __builtin_altivec_vctuxs(__a, __b);
}
+/* vec_signed */
+
+/* A forward declaration of vec_sld (defined later in this header) so the
+   endian-dependent vec_signede/vec_signedo below can use it. */
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int, vector signed int, unsigned const int __c);
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signed(vector float __a) {
+ return __builtin_convertvector(__a, vector signed int);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_signed(vector double __a) {
+ return __builtin_convertvector(__a, vector signed long long);
+}
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_signed2(vector double __a, vector double __b) {
+ return (vector signed int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpsxws(__a);
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpsxws(__a);
+#else
+ vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_unsigned */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsigned(vector float __a) {
+ return __builtin_convertvector(__a, vector unsigned int);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_unsigned(vector double __a) {
+ return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_unsigned2(vector double __a, vector double __b) {
+ return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpuxws(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpuxws(__a);
+#else
+ vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_float */
+
+static __inline__ vector float __ATTRS_o_ai
+vec_sld(vector float, vector float, unsigned const int __c);
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector signed int __a) {
+ return __builtin_convertvector(__a, vector float);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector unsigned int __a) {
+ return __builtin_convertvector(__a, vector float);
+}
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector signed long long __a, vector signed long long __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector unsigned long long __a, vector unsigned long long __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector double __a, vector double __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvsxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvuxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxdsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxdsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
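A note on the naming convention used from here on: a narrowing conversion
produces fewer elements than fit the output vector, so the e/o suffix says
which result lanes are defined. vec_floate fills the even lanes (0 and 2),
vec_floato the odd ones, and the endian-conditional vec_sld keeps that lane
numbering stable across byte orders; vec_float2 sidesteps the question by
packing two inputs into all four lanes:

    // d0, d1: vector double.
    vector float all  = vec_float2(d0, d1);  // {d0[0], d0[1], d1[0], d1[1]}
    vector float even = vec_floate(d0);      // lanes 0 and 2 defined
    vector float odd  = vec_floato(d0);      // lanes 1 and 3 defined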
/* vec_double */
#ifdef __VSX__
static __inline__ vector double __ATTRS_o_ai
vec_double(vector signed long long __a) {
+ return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+ return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvsxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvuxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvspdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector signed int __a) {
vector double __ret = {__a[0], __a[1]};
return __ret;
}
static __inline__ vector double __ATTRS_o_ai
-vec_double(vector unsigned long long __a) {
+vec_doubleh(vector unsigned int __a) {
vector double __ret = {__a[0], __a[1]};
return __ret;
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector float __a) {
+ vector double __ret = {__a[0], __a[1]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector signed int __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector unsigned int __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector float __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxwdp(__a);
+#else
+ return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxwdp(__a);
+#else
+ return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvspdp(__a);
+#else
+ return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#endif
+}
#endif
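Widening goes the other way: a four-element source holds twice as many
elements as the double result needs, and the suffix says which pair is taken.
doubleh/doublel use element order (high half, low half); doublee/doubleo take
the even or odd lanes via the converting instruction plus the
endian-correcting shift. For a float vector f = {f0, f1, f2, f3}:

    vector double hi = vec_doubleh(f);  // {f0, f1}
    vector double lo = vec_doublel(f);  // {f2, f3}
    vector double ev = vec_doublee(f);  // {f0, f2}
    vector double od = vec_doubleo(f);  // {f1, f3}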
/* vec_div */
@@ -3835,6 +5052,34 @@ vec_mergee(vector unsigned int __a, vector unsigned int __b) {
0x18, 0x19, 0x1A, 0x1B));
}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergee(vector bool long long __a, vector bool long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergee(vector signed long long __a, vector signed long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergee(vector unsigned long long __a, vector unsigned long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergee(vector float __a, vector float __b) {
+ return vec_perm(__a, __b,
+ (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+ 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+ 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergee(vector double __a, vector double __b) {
+ return vec_mergeh(__a, __b);
+}
+
/* vec_mergeo */
static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
@@ -3861,6 +5106,34 @@ vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
0x1C, 0x1D, 0x1E, 0x1F));
}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeo(vector bool long long __a, vector bool long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeo(vector signed long long __a, vector signed long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergeo(vector float __a, vector float __b) {
+ return vec_perm(__a, __b,
+ (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+ 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeo(vector double __a, vector double __b) {
+ return vec_mergel(__a, __b);
+}
+
#endif
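For the two-element types the even/odd merges collapse to the existing
high/low merges, since each input contributes exactly one even and one odd
element; only the float overloads need a real permute. The effect on
four-element vectors:

    // a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
    vector float e = vec_mergee(a, b);  // {a0, b0, a2, b2}
    vector float o = vec_mergeo(a, b);  // {a1, b1, a3, b3}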
/* vec_mfvscr */
@@ -4689,6 +5962,12 @@ static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
return ~(__a & __b);
}
+static __inline__ vector float __ATTRS_o_ai
+vec_nand(vector float __a, vector float __b) {
+ return (vector float)(~((vector unsigned int)__a &
+ (vector unsigned int)__b));
+}
+
static __inline__ vector signed long long __ATTRS_o_ai
vec_nand(vector signed long long __a, vector signed long long __b) {
return ~(__a & __b);
@@ -4724,6 +6003,12 @@ vec_nand(vector bool long long __a, vector bool long long __b) {
return ~(__a & __b);
}
+static __inline__ vector double __ATTRS_o_ai
+vec_nand(vector double __a, vector double __b) {
+ return (vector double)(~((vector unsigned long long)__a &
+ (vector unsigned long long)__b));
+}
+
#endif
/* vec_nmadd */
@@ -5195,6 +6480,16 @@ static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
return __a | ~__b;
}
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector bool int __a, vector float __b) {
+ return (vector float)(__a | ~(vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector float __a, vector bool int __b) {
+ return (vector float)((vector unsigned int)__a | ~__b);
+}
+
static __inline__ vector signed long long __ATTRS_o_ai
vec_orc(vector signed long long __a, vector signed long long __b) {
return __a | ~__b;
@@ -5229,6 +6524,16 @@ static __inline__ vector bool long long __ATTRS_o_ai
vec_orc(vector bool long long __a, vector bool long long __b) {
return __a | ~__b;
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector double __a, vector bool long long __b) {
+ return (vector double)((vector unsigned long long)__a | ~__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector double __b) {
+ return (vector double)(__a | ~(vector unsigned long long)__b);
+}
#endif
/* vec_vor */
@@ -5536,8 +6841,25 @@ vec_pack(vector bool long long __a, vector bool long long __b) {
#endif
}
+static __inline__ vector float __ATTRS_o_ai
+vec_pack(vector double __a, vector double __b) {
+ return (vector float) (__a[0], __a[1], __b[0], __b[1]);
+}
#endif
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack_to_short_fp32(vector float __a, vector float __b) {
+ vector float __resa = __builtin_vsx_xvcvsphp(__a);
+ vector float __resb = __builtin_vsx_xvcvsphp(__b);
+#ifdef __LITTLE_ENDIAN__
+ return (vector unsigned short)vec_mergee(__resa, __resb);
+#else
+ return (vector unsigned short)vec_mergeo(__resa, __resb);
+#endif
+}
+
+#endif
/* vec_vpkuhum */
#define __builtin_altivec_vpkuhum vec_vpkuhum
@@ -6324,6 +7646,34 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
}
#endif
+/* vec_rlmi */
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlmi(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return __builtin_altivec_vrlwmi(__a, __c, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
+ vector unsigned long long __c) {
+ return __builtin_altivec_vrldmi(__a, __c, __b);
+}
+
+/* vec_rlnm */
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlnm(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return __builtin_altivec_vrlwnm(__a, __b) & __c;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
+ vector unsigned long long __c) {
+ return __builtin_altivec_vrldnm(__a, __b) & __c;
+}
+#endif
+
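Note the argument shuffle in vec_rlmi: the vrlwmi/vrldmi builtins take
(rotate source, control, insert target), so __c and __b trade places relative
to the intrinsic's signature. Per the ISA 3.0 description, rlmi rotates each
element of __a and inserts it into __b under the mask encoded in __c, while
rlnm, as the expansion above shows directly, rotates __a by __b and keeps only
the bits selected by __c:

    // Rotate each word of a, insert into b under the mask spec in c.
    vector unsigned int r = vec_rlmi(a, b, c);
    // Rotate each word of a by b, then AND with mask c.
    vector unsigned int m = vec_rlnm(a, b, c);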
/* vec_vrlb */
static __inline__ vector signed char __ATTRS_o_ai
@@ -6984,6 +8334,145 @@ static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a,
#endif
}
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_sld(vector bool long long __a, vector bool long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sld(vector signed long long __a, vector signed long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sld(vector unsigned long long __a, vector unsigned long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a,
+ vector double __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+#endif
+
+/* vec_sldw */
+static __inline__ vector signed char __ATTRS_o_ai vec_sldw(
+ vector signed char __a, vector signed char __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sldw(vector unsigned char __a, vector unsigned char __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sldw(
+ vector signed short __a, vector signed short __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sldw(vector unsigned short __a, vector unsigned short __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw(
+ vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sldw(vector signed long long __a, vector signed long long __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+#endif
+
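vec_sldw is simply vec_sld at word granularity: the count __c is in 4-byte
words, scaled to bytes with << 2 and wrapped modulo 16, so only the low two
bits of __c matter. For example:

    vector signed int r = vec_sldw(a, b, 1);  // same as vec_sld(a, b, 4)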
+#ifdef __POWER9_VECTOR__
+/* vec_slv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slv(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vslv(__a, __b);
+}
+
+/* vec_srv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srv(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vsrv(__a, __b);
+}
+#endif
+
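Unlike vec_sll/vec_srl, which shift the whole 128-bit quantity by a single
amount, the POWER9 vslv/vsrv give every byte its own shift count (the low
three bits of the corresponding byte of __b), with vacated bit positions
refilled from the adjacent byte of __a per the ISA description. A sketch:

    // Shift each byte left by 3, refilling from its neighboring byte.
    vector unsigned char amounts = vec_splats((unsigned char)3);
    vector unsigned char r = vec_slv(bytes, amounts);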
/* vec_vsldoi */
static __inline__ vector signed char __ATTRS_o_ai
@@ -7307,6 +8796,20 @@ vec_sll(vector bool int __a, vector unsigned int __b) {
(vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsl((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsl */
static __inline__ vector signed char __ATTRS_o_ai
@@ -7570,6 +9073,32 @@ static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector signed char __b) {
+ return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector signed char __b) {
+ return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vslo */
static __inline__ vector signed char __ATTRS_o_ai
@@ -8304,6 +9833,20 @@ vec_srl(vector bool int __a, vector unsigned int __b) {
(vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsr((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsr */
static __inline__ vector signed char __ATTRS_o_ai
@@ -8567,6 +10110,32 @@ static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector signed char __b) {
+ return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector signed char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsro */
static __inline__ vector signed char __ATTRS_o_ai
@@ -9580,6 +11149,12 @@ vec_vsubfp(vector float __a, vector float __b) {
/* vec_subc */
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subc(vector signed int __a, vector signed int __b) {
+ return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a,
+ (vector unsigned int) __b);
+}
+
static __inline__ vector unsigned int __ATTRS_o_ai
vec_subc(vector unsigned int __a, vector unsigned int __b) {
return __builtin_altivec_vsubcuw(__a, __b);
@@ -9813,6 +11388,7 @@ vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
/* vec_vsubeuqm */
+
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
vector signed __int128 __c) {
@@ -9825,6 +11401,18 @@ vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
return __builtin_altivec_vsubeuqm(__a, __b, __c);
}
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sube(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
/* vec_vsubcuq */
static __inline__ vector signed __int128 __ATTRS_o_ai
@@ -9850,8 +11438,47 @@ vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
vector unsigned __int128 __c) {
return __builtin_altivec_vsubecuq(__a, __b, __c);
}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subec(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subec(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subec(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sube(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ vector signed int __mask = {1, 1, 1, 1};
+ vector signed int __carry = __c & __mask;
+ return vec_adde(__a, ~__b, __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sube(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ vector unsigned int __mask = {1, 1, 1, 1};
+ vector unsigned int __carry = __c & __mask;
+ return vec_adde(__a, ~__b, __carry);
+}
/* vec_sum4s */
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
@@ -10051,6 +11678,11 @@ vec_unpackh(vector bool int __a) {
return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
#endif
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackh(vector float __a) {
+ return (vector double)(__a[0], __a[1]);
+}
#endif
/* vec_vupkhsb */
@@ -10185,6 +11817,11 @@ vec_unpackl(vector bool int __a) {
return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
#endif
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackl(vector float __a) {
+ return (vector double)(__a[2], __a[3]);
+}
#endif
/* vec_vupklsb */
@@ -10935,6 +12572,55 @@ static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
return __a[__b];
}
+#ifdef __POWER9_VECTOR__
+
+/* vec_extract_exp */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_exp(vector float __a) {
+ return __builtin_vsx_xvxexpsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_exp(vector double __a) {
+ return __builtin_vsx_xvxexpdp(__a);
+}
+
+/* vec_extract_sig */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_sig(vector float __a) {
+ return __builtin_vsx_xvxsigsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_sig(vector double __a) {
+ return __builtin_vsx_xvxsigdp(__a);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shorth(vector unsigned short __a) {
+ vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1);
+#else
+ __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3);
+#endif
+ return __builtin_vsx_xvcvhpsp(__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shortl(vector unsigned short __a) {
+ vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1);
+#else
+ __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7);
+#endif
+ return __builtin_vsx_xvcvhpsp(__b);
+}
+#endif /* __POWER9_VECTOR__ */
+
/* vec_insert */
static __inline__ vector signed char __ATTRS_o_ai
@@ -14369,6 +16055,24 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a,
#endif
#ifdef __POWER8_VECTOR__
+static __inline__ vector bool char __ATTRS_o_ai
+vec_permxor(vector bool char __a, vector bool char __b,
+ vector bool char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_permxor(vector signed char __a, vector signed char __b,
+ vector signed char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_permxor(vector unsigned char __a, vector unsigned char __b,
+ vector unsigned char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
static __inline__ vector unsigned char __ATTRS_o_ai
__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
vector unsigned char __c) {
@@ -14453,6 +16157,572 @@ vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
#endif
#endif
+
+/* vec_reve */
+
+static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_reve(vector unsigned char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_reve(vector unsigned int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_reve(vector signed short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_reve(vector unsigned short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector float vec_reve(vector float __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector bool long long
+vec_reve(vector bool long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_reve(vector signed long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_reve(vector unsigned long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector double vec_reve(vector double __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+#endif
+
+/* vec_revb */
+static __inline__ vector bool char __ATTRS_o_ai
+vec_revb(vector bool char __a) {
+ return __a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_revb(vector signed char __a) {
+ return __a;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_revb(vector unsigned char __a) {
+ return __a;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_revb(vector bool short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_revb(vector signed short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_revb(vector unsigned short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_revb(vector bool int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_revb(vector signed int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_revb(vector unsigned int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_revb(vector float __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_revb(vector bool long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_revb(vector signed long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_revb(vector unsigned long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_revb(vector double __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+#endif /* End __VSX__ */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_revb(vector signed __int128 __a) {
+ vector unsigned char __indices =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (vector signed __int128)vec_perm((vector signed int)__a,
+ (vector signed int)__a,
+ __indices);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_revb(vector unsigned __int128 __a) {
+ vector unsigned char __indices =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (vector unsigned __int128)vec_perm((vector signed int)__a,
+ (vector signed int)__a,
+ __indices);
+}
+#endif /* END __POWER8_VECTOR__ && __powerpc64__ */
+
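Two reversal families that are easy to confuse: vec_reve reverses the order of
the elements themselves, while vec_revb reverses the bytes inside each element
(hence the char overloads below it are identity functions). vec_revb is
therefore the natural endianness converter:

    // Reinterpret 32-bit words read in foreign byte order.
    vector unsigned int native  = vec_revb(wire);    // swap bytes per word
    vector unsigned int flipped = vec_reve(native);  // reverse word order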
+/* vec_xl */
+
+static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
+ signed char *__ptr) {
+ return *(vector signed char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(signed long long __offset, unsigned char *__ptr) {
+ return *(vector unsigned char *)(__ptr + __offset);
+}
+
+/* For the wider element types, __offset is still a byte offset, so the
+   arithmetic is done on a char pointer rather than on __ptr directly. */
+static inline __ATTRS_o_ai vector signed short
+vec_xl(signed long long __offset, signed short *__ptr) {
+  return *(vector signed short *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(signed long long __offset, unsigned short *__ptr) {
+  return *(vector unsigned short *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xl(signed long long __offset, signed int *__ptr) {
+  return *(vector signed int *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xl(signed long long __offset, unsigned int *__ptr) {
+  return *(vector unsigned int *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
+                                               float *__ptr) {
+  return *(vector float *)((signed char *)__ptr + __offset);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(signed long long __offset, signed long long *__ptr) {
+  return *(vector signed long long *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(signed long long __offset, unsigned long long *__ptr) {
+  return *(vector unsigned long long *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
+                                                double *__ptr) {
+  return *(vector double *)((signed char *)__ptr + __offset);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai vector signed __int128
+vec_xl(signed long long __offset, signed __int128 *__ptr) {
+  return *(vector signed __int128 *)((signed char *)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned __int128
+vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+  return *(vector unsigned __int128 *)((signed char *)__ptr + __offset);
+}
+#endif
+
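vec_xl and vec_xst perform unaligned vector accesses at pointer plus byte
offset, in contrast to the classic vec_ld/vec_st, which truncate the effective
address down to 16-byte alignment. A sketch with illustrative names:

    // Sum signed ints 16 bytes at a time from an arbitrarily aligned buffer.
    vector signed int acc = vec_splats((signed int)0);
    for (size_t i = 0; i + 16 <= nbytes; i += 16)
      acc = vec_add(acc, vec_xl((signed long long)i, (signed int *)buf));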
+/* vec_xl_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed char *__ptr) {
+ vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+ vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed short *__ptr) {
+ vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+ vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed int *__ptr) {
+ return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned int *__ptr) {
+ return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_xl_be(signed long long __offset, float *__ptr) {
+ return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed long long *__ptr) {
+ return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned long long *__ptr) {
+ return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xl_be(signed long long __offset, double *__ptr) {
+ return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed __int128 *__ptr) {
+ return vec_xl(__offset, __ptr);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) {
+ return vec_xl(__offset, __ptr);
+}
+#endif
+#else
+ #define vec_xl_be vec_xl
+#endif
+
+/* vec_xst */
+
+static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
+ signed long long __offset,
+ signed char *__ptr) {
+ *(vector signed char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
+ signed long long __offset,
+ unsigned char *__ptr) {
+ *(vector unsigned char *)(__ptr + __offset) = __vec;
+}
+
+/* As with vec_xl, __offset is a byte offset for every element type. */
+static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
+                                        signed long long __offset,
+                                        signed short *__ptr) {
+  *(vector signed short *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
+                                        signed long long __offset,
+                                        unsigned short *__ptr) {
+  *(vector unsigned short *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
+                                        signed long long __offset,
+                                        signed int *__ptr) {
+  *(vector signed int *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
+                                        signed long long __offset,
+                                        unsigned int *__ptr) {
+  *(vector unsigned int *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector float __vec,
+                                        signed long long __offset,
+                                        float *__ptr) {
+  *(vector float *)((signed char *)__ptr + __offset) = __vec;
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
+                                        signed long long __offset,
+                                        signed long long *__ptr) {
+  *(vector signed long long *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
+                                        signed long long __offset,
+                                        unsigned long long *__ptr) {
+  *(vector unsigned long long *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector double __vec,
+                                        signed long long __offset,
+                                        double *__ptr) {
+  *(vector double *)((signed char *)__ptr + __offset) = __vec;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
+                                        signed long long __offset,
+                                        signed __int128 *__ptr) {
+  *(vector signed __int128 *)((signed char *)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
+                                        signed long long __offset,
+                                        unsigned __int128 *__ptr) {
+  *(vector unsigned __int128 *)((signed char *)__ptr + __offset) = __vec;
+}
+#endif
+
+/* vec_xst_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
+ signed long long __offset,
+ signed char *__ptr) {
+ vector signed char __tmp =
+ __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
+ signed long long __offset,
+ unsigned char *__ptr) {
+ vector unsigned char __tmp =
+ __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
+ signed long long __offset,
+ signed short *__ptr) {
+ vector signed short __tmp =
+ __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
+ signed long long __offset,
+ unsigned short *__ptr) {
+ vector unsigned short __tmp =
+ __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
+ signed long long __offset,
+ signed int *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
+ signed long long __offset,
+ unsigned int *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
+ signed long long __offset,
+ float *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
+ signed long long __offset,
+ signed long long *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
+ signed long long __offset,
+ unsigned long long *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
+ signed long long __offset,
+ double *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec,
+ signed long long __offset,
+ signed __int128 *__ptr) {
+ vec_xst(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
+ signed long long __offset,
+ unsigned __int128 *__ptr) {
+ vec_xst(__vec, __offset, __ptr);
+}
+#endif
+#else
+ #define vec_xst_be vec_xst
+#endif
+
+#ifdef __POWER9_VECTOR__
+#define vec_test_data_class(__a, __b) \
+ _Generic((__a), \
+ vector float: \
+ (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)), \
+ vector double: \
+ (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b)) \
+ )
+
+#endif /* #ifdef __POWER9_VECTOR__ */
+
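The _Generic above dispatches on operand type to the single- or
double-precision data-class test; __b is the 7-bit DCMX field of the
underlying xvtstdcsp/xvtstdcdp instructions and must be a compile-time
constant. A sketch, assuming the usual DCMX encoding in which 0x40 selects NaN
and 0x30 selects both infinities (these macro names are illustrative, not
provided by the header):

    #define DCMX_NAN 0x40
    #define DCMX_INF 0x30 /* +Inf | -Inf */
    vector bool int is_nan = vec_test_data_class(vf, DCMX_NAN);       // vf: vector float
    vector bool long long is_inf = vec_test_data_class(vd, DCMX_INF); // vd: vector double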
+static __inline__ vector float __ATTRS_o_ai vec_neg(vector float __a) {
+  return -__a;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_neg(vector double __a) {
+  return -__a;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector long long __ATTRS_o_ai
+vec_neg(vector long long __a) {
+  return -__a;
+}
+#endif
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_neg(vector signed int __a) {
+  return -__a;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_neg(vector signed short __a) {
+  return -__a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_neg(vector signed char __a) {
+  return -__a;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_nabs(vector float __a) {
+  return -vec_abs(__a);
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_nabs(vector double __a) {
+  return -vec_abs(__a);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector long long __ATTRS_o_ai
+vec_nabs(vector long long __a) {
+  return __builtin_altivec_vminsd(__a, -__a);
+}
+#endif
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nabs(vector signed int __a) {
+  return __builtin_altivec_vminsw(__a, -__a);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nabs(vector signed short __a) {
+  return __builtin_altivec_vminsh(__a, -__a);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nabs(vector signed char __a) {
+  return __builtin_altivec_vminsb(__a, -__a);
+}
#undef __ATTRS_o_ai
#endif /* __ALTIVEC_H */
diff --git a/lib/Headers/ammintrin.h b/lib/Headers/ammintrin.h
index 8985bb404f47..2843a7a2677f 100644
--- a/lib/Headers/ammintrin.h
+++ b/lib/Headers/ammintrin.h
@@ -30,7 +30,7 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index idx and of the length len.
+/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
@@ -38,7 +38,7 @@
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
@@ -49,8 +49,8 @@
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter x are extracted. If the length is zero but the index
-/// is non-zero, the result is undefined.
+/// [63:0] of parameter \a x are extracted. If the length is zero but the
+/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
@@ -58,11 +58,12 @@
(char)(len), (char)(idx)))
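A usage sketch for the constant-argument form just above, assuming an SSE4A target (e.g. -msse4a):

    #include <x86intrin.h>
    __m128i extract_byte_field(void) {
      __m128i v = _mm_set_epi64x(0, 0x0123456789ABCDEFLL);
      /* Extract 8 bits starting at bit index 4: bits [11:4] == 0xDE. */
      return _mm_extracti_si64(v, 8, 4);
    }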
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index and of the length specified by __y.
+/// integer vector operand at the index and of the length specified by
+/// \a __y.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
@@ -71,8 +72,8 @@
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
-/// both zero, bits [63:0] of parameter __x are extracted. If the length is
-/// zero but the index is non-zero, the result is undefined.
+/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
+/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -81,9 +82,9 @@ _mm_extract_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
-/// \brief Inserts bits of a specified length from the source integer vector y
-/// into the lower 64 bits of the destination integer vector x at the index
-/// idx and of the length len.
+/// \brief Inserts bits of a specified length from the source integer vector
+/// \a y into the lower 64 bits of the destination integer vector \a x at
+/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
@@ -92,15 +93,15 @@ _mm_extract_si64(__m128i __x, __m128i __y)
/// const int idx);
/// \endcode
///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length len and by the index idx specifying the least
-/// significant bit.
+/// are defined by the length \a len and by the index \a idx specifying the
+/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand y of length len.
+/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
@@ -108,45 +109,43 @@ _mm_extract_si64(__m128i __x, __m128i __y)
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter y are inserted into parameter x. If the length is
-/// zero but the index is non-zero, the result is undefined.
+/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
+/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand x with the specified bitfields replaced by the lower
-/// bits of source operand y. The upper 64 bits of the return value are
-/// undefined.
-
+/// destination operand \a x with the specified bitfields replaced by the
+/// lower bits of source operand \a y. The upper 64 bits of the return value
+/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
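A corresponding sketch for the constant-argument insert form above, under the same SSE4A assumption:

    #include <x86intrin.h>
    __m128i clear_byte_field(void) {
      __m128i x = _mm_set_epi64x(0, -1LL);   /* lower 64 bits all ones */
      __m128i y = _mm_setzero_si128();
      /* Replace bits [11:4] of x with the low 8 bits of y:
         the lower 64 bits become 0xFFFFFFFFFFFFF00F. */
      return _mm_inserti_si64(x, y, 8, 4);
    }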
/// \brief Inserts bits of a specified length from the source integer vector
-/// __y into the lower 64 bits of the destination integer vector __x at the
-/// index and of the length specified by __y.
+/// \a __y into the lower 64 bits of the destination integer vector \a __x
+/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
-/// specified by operand __y.
+/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand __y with length specified
-/// by bits [69:64]. These are inserted into the destination at the index
-/// specified by bits [77:72]; all other bits are ignored. If bits [69:64]
-/// are zero, the length is interpreted as 64. If the sum of the index and
-/// length is greater than 64, the result is undefined. If the length and
-/// index are both zero, bits [63:0] of parameter __y are inserted into
-/// parameter __x. If the length is zero but the index is non-zero, the
-/// result is undefined.
+/// bits are the least significant bits of operand \a __y with length
+/// specified by bits [69:64]. These are inserted into the destination at the
+/// index specified by bits [77:72]; all other bits are ignored. If bits
+/// [69:64] are zero, the length is interpreted as 64. If the sum of the
+/// index and length is greater than 64, the result is undefined. If the
+/// length and index are both zero, bits [63:0] of parameter \a __y are
+/// inserted into parameter \a __x. If the length is zero but the index is
+/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand __x with the specified bitfields replaced by the
-/// lower bits of source operand __y. The upper 64 bits of the return value
-/// are undefined.
-
+/// destination operand \a __x with the specified bitfields replaced by the
+/// lower bits of source operand \a __y. The upper 64 bits of the return
+/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
@@ -159,7 +158,7 @@ _mm_insert_si64(__m128i __x, __m128i __y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
@@ -177,7 +176,7 @@ _mm_stream_sd(double *__p, __m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
diff --git a/lib/Headers/armintr.h b/lib/Headers/armintr.h
new file mode 100644
index 000000000000..933afcbb91b6
--- /dev/null
+++ b/lib/Headers/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+ _ARM_BARRIER_SY = 0xF,
+ _ARM_BARRIER_ST = 0xE,
+ _ARM_BARRIER_ISH = 0xB,
+ _ARM_BARRIER_ISHST = 0xA,
+ _ARM_BARRIER_NSH = 0x7,
+ _ARM_BARRIER_NSHST = 0x6,
+ _ARM_BARRIER_OSH = 0x3,
+ _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
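These enumerators are intended as arguments to the MSVC-style barrier intrinsics; a sketch, assuming the __dmb intrinsic is available in this configuration (it is declared for ARM in clang's intrin.h):

    #include <intrin.h>
    void publish_flag(volatile int *p) {
      __dmb(_ARM_BARRIER_ISH); /* inner-shareable data memory barrier */
      *p = 1;
    }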
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index d3c5a6c96446..629dc8611a7f 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -350,19 +350,17 @@ _mm512_add_epi8 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
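The rewritten mask/maskz bodies throughout this file follow one pattern: compute the full-width result, then blend it lane-by-lane under the mask. A scalar model of the byte select (an illustration, not the real builtin):

    /* Lane i takes the freshly computed value when mask bit i is set,
       otherwise the fallback lane (__W for mask_, zero for maskz_). */
    static void selectb_512_model(unsigned long long u, const char *op,
                                  const char *fallback, char *result) {
      for (int i = 0; i < 64; ++i)
        result[i] = ((u >> i) & 1) ? op[i] : fallback[i];
    }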
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -371,19 +369,17 @@ _mm512_sub_epi8 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -392,19 +388,17 @@ _mm512_add_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -413,19 +407,17 @@ _mm512_sub_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -434,19 +426,17 @@ _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1018,31 +1008,25 @@ _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A, (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1537,55 +1521,49 @@ _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi16 (__m256i __A)
+_mm512_cvtepi8_epi16(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+  /* This function always performs a signed extension, but __v32qi is a
+     vector of char, which may be signed or unsigned, so use __v32qs. */
+ return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
}
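Why the __v32qs cast matters: __builtin_convertvector widens according to the source element type, and plain char is unsigned on some targets. A scalar analogue:

    void widen_demo(void) {
      signed char   sc = -1;
      unsigned char uc = 0xFF;
      short ws = (short)sc; /* 0xFFFF: sign-extended */
      short wz = (short)uc; /* 0x00FF: zero-extended */
      (void)ws; (void)wz;
    }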
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi(),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi16 (__m256i __A)
+_mm512_cvtepu8_epi16(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi(),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)_mm512_setzero_hi());
}
@@ -1704,79 +1682,70 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
(__v32hi)_mm512_setzero_hi()); })
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psllv32hi((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi16 (__m512i __A, __m128i __B)
+_mm512_sll_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psllw512((__v32hi)__A, (__v8hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_slli_epi16(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+}
-#define _mm512_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)__W);
+}
-#define _mm512_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
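Because _mm512_slli_epi16 and its masked forms are now inline functions rather than macros, the shift count can in principle be a runtime value; a sketch, assuming an AVX-512BW target and that the underlying builtin accepts a non-constant count:

    #include <immintrin.h>
    __m512i shift_all(__m512i v, int n) {
      /* n no longer has to be a compile-time constant. */
      return _mm512_slli_epi16(v, n);
    }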
#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector( \
@@ -1848,155 +1817,136 @@ _mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi16 (__m512i __A, __m512i __B)
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi16 (__m512i __A, __m512i __B)
+_mm512_srav_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi16 (__m512i __A, __m128i __B)
+_mm512_sra_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psraw512((__v32hi)__A, (__v8hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_srai_epi16(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
-
-#define _mm512_mask_srai_epi16(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+}
-#define _mm512_maskz_srai_epi16(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)__W);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi16 (__m512i __A, __m128i __B)
+_mm512_srl_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psrlw512((__v32hi)__A, (__v8hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_srli_epi16(A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
-#define _mm512_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)__W);
+}
-#define _mm512_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector( \
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
index 13665e4c6668..ae44b98a9495 100644
--- a/lib/Headers/avx512dqintrin.h
+++ b/lib/Headers/avx512dqintrin.h
@@ -37,204 +37,169 @@ _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_xor_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A ^ (__v8du) __B);
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A ^ (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_xor_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A ^ (__v16su) __B);
+ return (__m512)((__v16su)__A ^ (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_or_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A | (__v8du) __B);
+_mm512_or_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A | (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_or_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A | (__v16su) __B);
+_mm512_or_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A | (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_and_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A & (__v8du) __B);
+_mm512_and_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A & (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_and_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A & (__v16su) __B);
+_mm512_and_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A & (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_andnot_pd (__m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+ return (__m512d)(~(__v8du)__A & (__v8du)__B);
}
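Note the operand order in the new body: the first operand is complemented, so andnot(A, B) == (~A) & B. The classic use is clearing the sign bit; a sketch assuming an AVX-512F target:

    #include <immintrin.h>
    __m512d vector_fabs(__m512d x) {
      /* (~sign_mask) & x clears each lane's sign bit, yielding |x|. */
      return _mm512_andnot_pd(_mm512_set1_pd(-0.0), x);
    }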
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_andnot_ps (__m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1);
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+ return (__m512)(~(__v16su)__A & (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1151,148 +1116,184 @@ _mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1); })
+ (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ ((imm) & 1) ? 8 : 0, \
+ ((imm) & 1) ? 9 : 1, \
+ ((imm) & 1) ? 10 : 2, \
+ ((imm) & 1) ? 11 : 3, \
+ ((imm) & 1) ? 12 : 4, \
+ ((imm) & 1) ? 13 : 5, \
+ ((imm) & 1) ? 14 : 6, \
+ ((imm) & 1) ? 15 : 7); })
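The shufflevector form makes the immediate's meaning explicit: bit 0 of imm selects the lower (elements 0..7) or upper (elements 8..15) 256-bit half. A sketch, assuming an AVX-512DQ target:

    #include <immintrin.h>
    __m256 upper_half(__m512 v) {
      return _mm512_extractf32x8_ps(v, 1); /* elements 8..15 */
    }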
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+ (__v8sf)(W)); })
#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+ (__v8sf)_mm256_setzero_ps()); })
#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1); })
+ (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ 0 + ((imm) & 0x3) * 2, \
+ 1 + ((imm) & 0x3) * 2); })
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+ (__v2df)(W)); })
#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+ (__v2df)_mm_setzero_pd()); })
#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ ((imm) & 1) ? 8 : 0, \
+ ((imm) & 1) ? 9 : 1, \
+ ((imm) & 1) ? 10 : 2, \
+ ((imm) & 1) ? 11 : 3, \
+ ((imm) & 1) ? 12 : 4, \
+ ((imm) & 1) ? 13 : 5, \
+ ((imm) & 1) ? 14 : 6, \
+ ((imm) & 1) ? 15 : 7); })
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+ (__v8si)(W)); })
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ 0 + ((imm) & 0x3) * 2, \
+ 1 + ((imm) & 0x3) * 2); })
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+ (__v2di)(W)); })
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1); })
+ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
+ ((imm) & 0x1) ? 0 : 16, \
+ ((imm) & 0x1) ? 1 : 17, \
+ ((imm) & 0x1) ? 2 : 18, \
+ ((imm) & 0x1) ? 3 : 19, \
+ ((imm) & 0x1) ? 4 : 20, \
+ ((imm) & 0x1) ? 5 : 21, \
+ ((imm) & 0x1) ? 6 : 22, \
+ ((imm) & 0x1) ? 7 : 23, \
+ ((imm) & 0x1) ? 16 : 8, \
+ ((imm) & 0x1) ? 17 : 9, \
+ ((imm) & 0x1) ? 18 : 10, \
+ ((imm) & 0x1) ? 19 : 11, \
+ ((imm) & 0x1) ? 20 : 12, \
+ ((imm) & 0x1) ? 21 : 13, \
+ ((imm) & 0x1) ? 22 : 14, \
+ ((imm) & 0x1) ? 23 : 15); })
#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)(W)); })
#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()); })
#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1); })
+ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
+ (((imm) & 0x3) == 0) ? 8 : 0, \
+ (((imm) & 0x3) == 0) ? 9 : 1, \
+ (((imm) & 0x3) == 1) ? 8 : 2, \
+ (((imm) & 0x3) == 1) ? 9 : 3, \
+ (((imm) & 0x3) == 2) ? 8 : 4, \
+ (((imm) & 0x3) == 2) ? 9 : 5, \
+ (((imm) & 0x3) == 3) ? 8 : 6, \
+ (((imm) & 0x3) == 3) ? 9 : 7); })
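In this form, imm & 0x3 names which 128-bit (two-double) lane of A is replaced by B while the other three lanes pass through. For instance, under the same AVX-512DQ assumption:

    #include <immintrin.h>
    __m512d replace_top_lane(__m512d a, __m128d b) {
      /* imm == 3 selects the top 128-bit lane: doubles 6..7 of a. */
      return _mm512_insertf64x2(a, b, 3);
    }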
#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)(W)); })
#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()); })
#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
+ ((imm) & 0x1) ? 0 : 16, \
+ ((imm) & 0x1) ? 1 : 17, \
+ ((imm) & 0x1) ? 2 : 18, \
+ ((imm) & 0x1) ? 3 : 19, \
+ ((imm) & 0x1) ? 4 : 20, \
+ ((imm) & 0x1) ? 5 : 21, \
+ ((imm) & 0x1) ? 6 : 22, \
+ ((imm) & 0x1) ? 7 : 23, \
+ ((imm) & 0x1) ? 16 : 8, \
+ ((imm) & 0x1) ? 17 : 9, \
+ ((imm) & 0x1) ? 18 : 10, \
+ ((imm) & 0x1) ? 19 : 11, \
+ ((imm) & 0x1) ? 20 : 12, \
+ ((imm) & 0x1) ? 21 : 13, \
+ ((imm) & 0x1) ? 22 : 14, \
+ ((imm) & 0x1) ? 23 : 15); })
#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)(W)); })
#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
+ (((imm) & 0x3) == 0) ? 8 : 0, \
+ (((imm) & 0x3) == 0) ? 9 : 1, \
+ (((imm) & 0x3) == 1) ? 8 : 2, \
+ (((imm) & 0x3) == 1) ? 9 : 3, \
+ (((imm) & 0x3) == 2) ? 8 : 4, \
+ (((imm) & 0x3) == 2) ? 9 : 5, \
+ (((imm) & 0x3) == 3) ? 8 : 6, \
+ (((imm) & 0x3) == 3) ? 9 : 7); })
#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)(W)); })
#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 0bf6582345d4..e6a7217c8967 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -54,6 +54,19 @@ typedef unsigned short __mmask16;
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
+/* Constants for integer comparison predicates */
+typedef enum {
+ _MM_CMPINT_EQ, /* Equal */
+ _MM_CMPINT_LT, /* Less than */
+ _MM_CMPINT_LE, /* Less than or Equal */
+ _MM_CMPINT_UNUSED,
+ _MM_CMPINT_NE, /* Not Equal */
+ _MM_CMPINT_NLT, /* Not Less than */
+#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
+ _MM_CMPINT_NLE /* Not Less than or Equal */
+#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
+} _MM_CMPINT_ENUM;
+
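These predicates are consumed by the _mm512_cmp_*_mask family; a sketch assuming an AVX-512F target:

    #include <immintrin.h>
    __mmask16 lanes_less(__m512i a, __m512i b) {
      /* One result bit per 32-bit lane where a < b (signed compare). */
      return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
    }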
typedef enum
{
_MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
@@ -503,6 +516,18 @@ _mm512_castsi512_si256 (__m512i __A)
return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+ return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+ return (int)__a;
+}
+
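These are thin converters between plain int mask bits and the __mmask16 type; a round-trip sketch:

    #include <immintrin.h>
    int mask_roundtrip(void) {
      __mmask16 m = _mm512_int2mask(0x00FF); /* select the low 8 lanes */
      return _mm512_mask2int(m);             /* == 0x00FF */
    }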
/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_and_epi32(__m512i __a, __m512i __b)
@@ -737,22 +762,19 @@ _mm512_add_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -762,22 +784,19 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -787,22 +806,19 @@ _mm512_add_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -812,22 +828,19 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
@@ -1403,57 +1416,45 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si)__Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512());
}
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
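
For reference, _mm512_mul_epi32 and _mm512_mul_epu32 are widening multiplies: each 64-bit result lane is the product of the even (low) 32-bit elements of the sources, sign- or zero-extended respectively. A rough scalar sketch of the signed form (illustrative helper, not from the header):

#include <stdint.h>
/* Illustrative scalar model: multiply the even 32-bit element of each
   64-bit lane, widening to a 64-bit product. */
static void mul_epi32_model(int64_t dst[8], const int32_t x[16],
                            const int32_t y[16]) {
  for (int i = 0; i < 8; ++i)
    dst[i] = (int64_t)x[2 * i] * (int64_t)y[2 * i];
}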
static __inline __m512i __DEFAULT_FN_ATTRS
@@ -1463,21 +1464,19 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B)
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W, __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)__W);
}
#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
@@ -1977,38 +1976,30 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
@@ -2120,40 +2111,30 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
@@ -2265,40 +2246,30 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
@@ -2417,21 +2388,16 @@ _mm512_div_pd(__m512d __a, __m512d __b)
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline __m512 __DEFAULT_FN_ATTRS
@@ -2442,21 +2408,16 @@ _mm512_div_ps(__m512 __a, __m512 __b)
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
@@ -3443,71 +3404,94 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
}
#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(A), \
+ ((int)(I) & 0x7) + 0, \
+ ((int)(I) & 0x7) + 1, \
+ ((int)(I) & 0x7) + 2, \
+ ((int)(I) & 0x7) + 3, \
+ ((int)(I) & 0x7) + 4, \
+ ((int)(I) & 0x7) + 5, \
+ ((int)(I) & 0x7) + 6, \
+ ((int)(I) & 0x7) + 7); })
#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(A), \
+ ((int)(I) & 0xf) + 0, \
+ ((int)(I) & 0xf) + 1, \
+ ((int)(I) & 0xf) + 2, \
+ ((int)(I) & 0xf) + 3, \
+ ((int)(I) & 0xf) + 4, \
+ ((int)(I) & 0xf) + 5, \
+ ((int)(I) & 0xf) + 6, \
+ ((int)(I) & 0xf) + 7, \
+ ((int)(I) & 0xf) + 8, \
+ ((int)(I) & 0xf) + 9, \
+ ((int)(I) & 0xf) + 10, \
+ ((int)(I) & 0xf) + 11, \
+ ((int)(I) & 0xf) + 12, \
+ ((int)(I) & 0xf) + 13, \
+ ((int)(I) & 0xf) + 14, \
+ ((int)(I) & 0xf) + 15); })
#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
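
The rewritten alignr macros lower directly to __builtin_shufflevector: conceptually the sources are concatenated as {B, A} and eight qwords (or sixteen dwords) are taken starting at the masked immediate. A scalar sketch of the 64-bit form under that reading (illustrative helper):

#include <stdint.h>
/* Illustrative scalar model of valignq: concatenate {B, A} into a
   16-qword buffer and take 8 qwords starting at (I & 7).  The 32-bit
   variant is analogous with a 0xf mask. */
static void alignr_epi64_model(int64_t dst[8], const int64_t a[8],
                               const int64_t b[8], int imm) {
  int64_t cat[16];
  for (int i = 0; i < 8; ++i) {
    cat[i] = b[i];       /* low half of the concatenation  */
    cat[i + 8] = a[i];   /* high half of the concatenation */
  }
  for (int i = 0; i < 8; ++i)
    dst[i] = cat[(imm & 7) + i];
}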
/* Vector Extract */
-#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
- (__v4df)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
+ (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ ((I) & 1) ? 4 : 0, \
+ ((I) & 1) ? 5 : 1, \
+ ((I) & 1) ? 6 : 2, \
+ ((I) & 1) ? 7 : 3); })
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)(W)); })
#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)_mm256_setzero_pd()); })
-#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1); })
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
+ (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ 0 + ((I) & 0x3) * 4, \
+ 1 + ((I) & 0x3) * 4, \
+ 2 + ((I) & 0x3) * 4, \
+ 3 + ((I) & 0x3) * 4); })
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)(W)); })
#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)_mm_setzero_ps()); })
+
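Likewise, the unmasked extracts are now plain shuffles: extractf64x4 picks the low or high 256-bit half depending on bit 0 of the immediate, and extractf32x4 one of four 128-bit quarters. A scalar sketch of the 64x4 case (illustrative helper):

/* Illustrative scalar model: return the low (I even) or high (I odd)
   four doubles of the 512-bit source. */
static void extractf64x4_model(double dst[4], const double a[8], int imm) {
  int base = (imm & 1) ? 4 : 0;
  for (int i = 0; i < 4; ++i)
    dst[i] = a[base + i];
}
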
/* Vector Blend */
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -3556,10 +3540,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_ps_mask(A, B, P) \
_mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
_mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
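The new fixed-predicate aliases make common comparisons shorter; a small usage sketch combining two of them (assumes AVX-512F is enabled; the helper name is illustrative):

#include <immintrin.h>
/* Count lanes of a that are strictly less than b, restricted to
   ordered (non-NaN) lanes, using the new aliases. */
static int count_ordered_lt(__m512 a, __m512 b) {
  __mmask16 ord = _mm512_cmpord_ps_mask(a, b);           /* _CMP_ORD_Q */
  __mmask16 lt = _mm512_mask_cmplt_ps_mask(ord, a, b);   /* _CMP_LT_OS */
  return __builtin_popcount((unsigned)lt);
}
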
#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
(__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(P), \
@@ -3572,10 +3595,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_pd_mask(A, B, P) \
_mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
_mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
/* Conversion */
#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
@@ -3682,26 +3744,35 @@ _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepi32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+ return (__m512d)_mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d)_mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
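
The new *lo_pd entry points convert only the low 256 bits of a 512-bit integer source; the upper half is ignored. A usage sketch (illustrative helper):

#include <immintrin.h>
/* Widen the low eight 32-bit integers of a 512-bit vector to doubles;
   equivalent to _mm512_cvtepi32_pd(_mm512_castsi512_si256(v)). */
static __m512d widen_low_eight(__m512i v) {
  return _mm512_cvtepi32lo_pd(v);
}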
static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -3734,26 +3805,35 @@ _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepu32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+ return (__m512d)_mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d)_mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
@@ -3798,6 +3878,24 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_pslo(__m512d __A)
+{
+ return (__m512)__builtin_shufflevector((__v8sf)_mm512_cvtpd_ps(__A),
+ (__v8sf)_mm256_setzero_ps(),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_pslo(__m512 __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512)__builtin_shufflevector(
+ (__v8sf)_mm512_mask_cvtpd_ps(_mm512_castps512_ps256(__W),
+ __U, __A),
+ (__v8sf)_mm256_setzero_ps(),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
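cvtpd_pslo follows the same low-half convention in the other direction: the eight converted floats land in the low lanes and the high eight lanes are zeroed, per the shuffle above. A scalar sketch (illustrative helper):

/* Illustrative scalar model: narrow eight doubles into the low eight
   float lanes and zero the upper eight lanes. */
static void cvtpd_pslo_model(float dst[16], const double a[8]) {
  for (int i = 0; i < 8; ++i)
    dst[i] = (float)a[i];
  for (int i = 8; i < 16; ++i)
    dst[i] = 0.0f;
}
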
#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)_mm256_undefined_si256(), \
@@ -4919,263 +5017,227 @@ _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi32 (__m128i __A)
+_mm512_cvtepi8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ /* This function always performs a signed extension, but __v16qi is a vector
+ of plain 'char', whose signedness is implementation-defined, so use the
+ explicitly signed __v16qs instead. */
+ return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
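
The signedness note above is worth spelling out: the widen must sign-extend even on targets where plain 'char' is unsigned, which is exactly what casting through __v16qs guarantees. A scalar sketch of the intended semantics (illustrative helper):

#include <stdint.h>
/* Illustrative scalar model: sign-extending widen from 8 to 32 bits,
   regardless of the target's plain-'char' signedness. */
static void cvtepi8_epi32_model(int32_t dst[16], const int8_t a[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (int32_t)a[i];  /* sign-extending widen */
}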
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi64 (__m128i __A)
+_mm512_cvtepi8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ /* This function always performs a signed extension, but __v16qi is a vector
+ of plain 'char', whose signedness is implementation-defined, so use the
+ explicitly signed __v16qs instead. */
+ return (__m512i)__builtin_convertvector(
+ __builtin_shufflevector((__v16qs)__A, (__v16qs)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi32_epi64 (__m256i __X)
+_mm512_cvtepi32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi32 (__m256i __A)
+_mm512_cvtepi16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi64 (__m128i __A)
+_mm512_cvtepi16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi32 (__m128i __A)
+_mm512_cvtepu8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi64 (__m128i __A)
+_mm512_cvtepu8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector(
+ __builtin_shufflevector((__v16qu)__A, (__v16qu)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu32_epi64 (__m256i __X)
+_mm512_cvtepu32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi32 (__m256i __A)
+_mm512_cvtepu16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi64 (__m128i __A)
+_mm512_cvtepu16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -5393,67 +5455,91 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U)); })
-#define _mm512_slli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
-
-#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
-
-#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
-#define _mm512_slli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)__W);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)__W);
+}
-#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
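
The shift-by-immediate family becomes real inline functions here, which, assuming the unmasked builtins accept a non-constant count as the new signatures suggest, also lets the count be a runtime value. A hedged usage sketch (illustrative helper):

#include <immintrin.h>
/* With the function form, the shift count need not be an integer
   constant expression. */
static __m512i shift_left_each(__m512i v, int count) {
  return _mm512_slli_epi32(v, count);
}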
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
@@ -5911,8 +5997,10 @@ _mm512_kmov (__mmask16 __A)
(int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
(int)(P), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
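
The double-to-64-bit scalar conversions only exist in 64-bit mode, hence the new guard; call sites presumably need the same treatment. A usage sketch (illustrative helper):

#include <immintrin.h>
/* Mirror the header's guard: only reference the 64-bit conversion
   when targeting x86-64. */
#ifdef __x86_64__
static long long sd_to_i64(__m128d v) {
  return _mm_cvt_roundsd_si64(v, _MM_FROUND_CUR_DIRECTION);
}
#endif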
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
@@ -5926,351 +6014,267 @@ _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi32 (__m512i __A, __m128i __B)
+_mm512_sll_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_pslld512((__v16si)__A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi64 (__m512i __A, __m128i __B)
+_mm512_sll_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
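
Three shift flavors appear in these hunks: slli (one immediate count), sll (one count in the low qword of an __m128i), and sllv (a count per lane). For the variable logical shifts, counts of 32 (or 64) and up produce zero; a scalar sketch of the 32-bit case (illustrative helper):

#include <stdint.h>
/* Illustrative scalar model of vpsllvd: each lane shifts by its own
   count; counts >= 32 yield zero for the logical shift. */
static void sllv_epi32_model(uint32_t dst[16], const uint32_t x[16],
                             const uint32_t counts[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (counts[i] < 32) ? (x[i] << counts[i]) : 0;
}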
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi32 (__m512i __A, __m128i __B)
+_mm512_sra_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrad512((__v16si)__A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi64 (__m512i __A, __m128i __B)
+_mm512_sra_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi32 (__m512i __A, __m128i __B)
+_mm512_srl_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrld512((__v16si)__A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi64 (__m512i __A, __m128i __B)
+_mm512_srl_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
@@ -6309,8 +6313,10 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)(U)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6328,6 +6334,7 @@ _mm_cvtsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6339,6 +6346,7 @@ _mm_cvtsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6346,11 +6354,13 @@ _mm_cvtsd_u64 (__m128d __A)
#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6362,6 +6372,7 @@ _mm_cvtss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6373,6 +6384,7 @@ _mm_cvtss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6387,6 +6399,7 @@ _mm_cvttsd_i32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
@@ -6399,6 +6412,7 @@ _mm_cvttsd_i64 (__m128d __A)
return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6410,6 +6424,7 @@ _mm_cvttsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6421,6 +6436,7 @@ _mm_cvttsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6435,6 +6451,7 @@ _mm_cvttss_i32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
@@ -6447,6 +6464,7 @@ _mm_cvttss_i64 (__m128 __A)
return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6458,6 +6476,7 @@ _mm_cvttss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6469,6 +6488,7 @@ _mm_cvttss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
@@ -6556,61 +6576,47 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
(__v16sf)_mm512_setzero_ps()); })
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_permutevar_pd (__m512d __A, __m512i __C)
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_permutevar_ps (__m512 __A, __m512i __C)
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf) __W,
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -7028,35 +7034,48 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION); })
-#define _mm512_srai_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)__W);
+}
-#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srai_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)__W);
+}
-#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)_mm512_setzero_si512());
+}
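// Note that srai is an arithmetic right shift: the sign bit is replicated,
// unlike the logical srli. For a lane holding -8 (0xFFFFFFF8),
// _mm512_srai_epi32(v, 1) yields -4 (0xFFFFFFFC).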
#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
@@ -7832,107 +7851,145 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
__builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}
-#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1); })
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
+ (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ 0 + ((imm) & 0x3) * 4, \
+ 1 + ((imm) & 0x3) * 4, \
+ 2 + ((imm) & 0x3) * 4, \
+ 3 + ((imm) & 0x3) * 4); })
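// Worked example of the index arithmetic above: for imm = 2,
// ((imm) & 0x3) * 4 == 8, so the shuffle selects elements 8..11 -- the third
// 128-bit lane of the 512-bit source.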
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+ (__v4si)(W)); })
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
-#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_undefined_si256(), \
- (__mmask8)-1); })
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
+ (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+ (__v4di)(W)); })
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1); })
+ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
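// In the shuffle above, indices 0..7 address A and 8..11 address B (after B
// is widened to 512 bits). With imm = 1 the result is {A[0..3], B[0..3]};
// with imm = 0 it is {B[0..3], A[4..7]}.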
#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)(W)); })
#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()); })
#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)(W)); })
#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1); })
+ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
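// Here indices 16..19 address B after it is widened to 512 bits; (imm) & 0x3
// picks which 128-bit lane of A is replaced. For imm = 1, elements 4..7 of
// the result come from B[0..3] and all other elements come from A.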
#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)(W)); })
#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()); })
#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)(W)); })
#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
@@ -8275,17 +8332,17 @@ __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
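// The reordering above makes __W the first builtin operand, matching Intel's
// definition of the masked scalar FMA:
//   dst[0] = __U[0] ? __W[0]*__A[0] + __B[0] : __W[0],
// with the upper elements taken from __W.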
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8323,17 +8380,17 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8355,33 +8412,33 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8419,17 +8476,17 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8451,33 +8508,33 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+ return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8515,17 +8572,17 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8547,33 +8604,33 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
(__v2df) __X,
- -(__v2df) __Y,
+ (__v2df) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8611,17 +8668,17 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8644,17 +8701,17 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W),
+ return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
(__v2df) __X,
- -(__v2df) (__Y),
+ (__v2df) (__Y),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
#define _mm512_permutex_pd(X, C) __extension__ ({ \
@@ -9041,6 +9098,101 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
(__v16sf)_mm512_setzero_ps());
}
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
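// Only mask bit 0 participates in the move_{ss,sd} intrinsics above: element
// 0 comes from __B when the bit is set, otherwise from the write-through
// source (or zero), while the upper elements always come from __A. E.g.:
//   __m128 r = _mm_mask_move_ss(w, 0x1, a, b);  /* r = {b[0], a[1..3]} */
//   __m128 z = _mm_maskz_move_ss(0x0, a, b);    /* z = {0,    a[1..3]} */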
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess128_mask ((__v16sf *)__W,
+ (__v16sf) _mm512_castps128_ps512(__A),
+ (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd128_mask ((__v8df *)__W,
+ (__v8df) _mm512_castpd128_pd512(__A),
+ (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+ __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+ (__v4sf) {0.0, 0.0, 0.0, 0.0},
+ 0, 4, 4, 4);
+
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_castps128_ps512(src),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_setzero_ps(),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+ __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+ (__v2df) {0.0, 0.0}, 0, 2);
+
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_castpd128_pd512(src),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_setzero_pd(),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
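// For the masked scalar loads above, a clear mask bit should suppress the
// memory access entirely (AVX-512 masked-load fault suppression), with
// element 0 falling back to __W (or zero for the maskz forms). A hypothetical
// caller:
//   __m128 r = _mm_mask_load_ss(w, valid ? 1 : 0, p); /* p unread if !valid */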
+
#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
(__v16si)_mm512_undefined_epi32(), \
@@ -9243,6 +9395,18 @@ _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtpslo_pd (__m512 __A)
+{
+ return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+ return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
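// The "pslo" naming means only the low 256 bits (8 floats) of the source are
// converted, widening to 8 doubles:
//   __m512d d = _mm512_cvtpslo_pd(f);  /* d[i] = (double)f[i] for i < 8 */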
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
@@ -9340,14 +9504,17 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
}
#define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
@@ -9355,6 +9522,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
@@ -9362,6 +9530,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
@@ -9369,6 +9538,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
@@ -9412,6 +9582,7 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B)
return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9422,6 +9593,7 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
@@ -9434,6 +9606,7 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9444,6 +9617,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
@@ -9452,12 +9626,14 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
__M);
}
+#ifdef __x86_64__
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
__M);
}
+#endif
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
@@ -9514,27 +9690,553 @@ _mm512_set_ps (float __A, float __B, float __C, float __D,
(e4),(e3),(e2),(e1),(e0))
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_abs_ps(__m512 A)
+_mm512_abs_ps(__m512 __A)
{
- return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A)
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
- return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_abs_pd(__m512d A)
+_mm512_abs_pd(__m512d __A)
{
- return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ;
+ return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A)
-{
- return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A);
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+ return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+// Vector-reduction arithmetic accepts vectors as inputs and produces scalars
+// as outputs. This class of vector operation forms the basis of many
+// scientific computations. In vector-reduction arithmetic, the result is
+// independent of the order of the input elements of V.
+
+// We use the bisection method: at each step, the vector from the previous
+// step is partitioned in half and the operation is applied to the two halves.
+// This takes log2(n) steps, where n is the number of elements in the vector.
+
+// Vec512 - Vector of size 512.
+// Operator - Can be one of the following: +, *, &, |
+// T2 - 'i' for int, 'f' for float.
+// T1 - 'i' for int, 'd' for double.
+
+#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
+ __extension__({ \
+ __m256##T1 Vec256 = __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7); \
+ __m128##T1 Vec128 = __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 0, 1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 2, 3); \
+ Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 0, -1) \
+ Operator \
+ __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 1, -1); \
+ return Vec128[0]; \
+ })
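// A rough model of what the macro produces for _mm512_reduce_add_epi64 --
// not its literal expansion (assumes <immintrin.h>, AVX2 and AVX-512F):
static inline long long reduce_add_epi64_model(__m512i v) {
  __m256i lo = _mm512_extracti64x4_epi64(v, 0);      /* 8 -> 4 elements */
  __m256i hi = _mm512_extracti64x4_epi64(v, 1);
  __m256i s4 = _mm256_add_epi64(lo, hi);
  __m128i s2 = _mm_add_epi64(_mm256_castsi256_si128(s4),
                             _mm256_extracti128_si256(s4, 1)); /* 4 -> 2 */
  long long out[2];
  _mm_storeu_si128((__m128i *)out, s2);
  return out[0] + out[1];                            /* 2 -> 1 */
}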
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, +, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, *, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, &, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, |, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, +, f, d);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, *, f, d);
+}
+
+// Vec512 - Vector of size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+//                 Identity element: {+,0}, {*,1}, {&,0xFFFFFFFFFFFFFFFF}, {|,0}
+// Operator - Can be one of the following: +, *, &, |
+// Mask - Intrinsic Mask
+// T2 - 'i' for int, 'f' for float.
+// T1 - 'i' for int, 'd' for packed double-precision.
+// T3 - 'pd' for packed double, 'q' for quadword.
+
+#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = __builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
+ })
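// Masked-off lanes are first replaced with the operator's identity (0 for +,
// 1 for *, all-ones for &, 0 for |) so the unmasked bisection reduction above
// can be reused unchanged.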
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ &, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
+ i, i, q);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
+ f, d, pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
+ f, d, pd);
+}
+
+// Vec512 - Vector of size 512.
+// Operator - Can be one of the following: +, *, &, |
+// T2 - 'i' for int, 'f' for float.
+// T1 - 'i' for int, '' (empty) for packed single.
+
+#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
+ __m256##T1 Vec256 = \
+ (__m256##T1)(__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15)); \
+ __m128##T1 Vec128 = \
+ (__m128##T1)(__builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 4, 5, 6, 7)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, 1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 2, 3, -1, -1)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, -1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 1, -1, -1, -1)); \
+ return Vec128[0]; \
+ })
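// Same bisection scheme as the 64-bit variant, but with 16 lanes it takes
// four halvings (16 -> 8 -> 4 -> 2 -> 1) before element 0 holds the result.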
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_add_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, +, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, *, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_and_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, &, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_or_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, |, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, +, f, );
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, *, f, );
+}
+
+// Vec512 - Vector of size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+//                 Identity element: {+,0}, {*,1}, {&,0xFFFFFFFF}, {|,0}
+// Operator - Can be one of the following: +, *, &, |
+// Mask - Intrinsic Mask
+// T2 - 'i' for int, 'f' for float.
+// T1 - 'i' for int, '' (empty) for packed single.
+// T3 - 'ps' for packed single, 'd' for dword.
+
+#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
+ i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+}
+
+// We use the bisection method: at each step, the vector from the previous
+// step is partitioned in half and the operation is applied to the two halves.
+// This takes log2(n) steps, where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector of size 512.
+// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd},
+//              e.g. _mm512_max_epi64
+// T1 - 'i' for int, 'd' for double. [__m512{i|d}]
+// T2 - 'i' for int, 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, \
+ -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1))\
+ ; \
+ return Vec512[0]; \
+ })
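// The -1 shuffle indices mark "don't care" lanes: each step only needs the
// low elements to be meaningful, and after three halvings the reduction
// result is read from element 0.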
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epi64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_max_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epi64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_min_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - Vector of size 512.
+// Vec512Neutral - A 512-bit vector with all elements set to the identity
+//                 element.
+//                 Identity element: {max_epi,0x8000000000000000}
+//                                   {max_epu,0x0000000000000000}
+//                                   {max_pd, 0xFFF0000000000000}
+//                                   {min_epi,0x7FFFFFFFFFFFFFFF}
+//                                   {min_epu,0xFFFFFFFFFFFFFFFF}
+//                                   {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd},
+//              e.g. _mm512_max_epi64
+// T1 - 'i' for int, 'd' for double. [__m512{i|d}]
+// T2 - 'i' for int, 'f' for float. [__v8d{i|f}]
+// T3 - 'q' for quadword, 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
+ })
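// The neutral value is chosen so that masked-off lanes can never win the
// comparison: -inf for max_pd and +inf for min_pd, INT64_MIN
// (0x8000000000000000) for max_epi64, 0 for max_epu64, and so on.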
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+ max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+ max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
+ max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+ min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+ min_pd, d, f, pd, __M);
+}
+
+// Vec512 - Vector of size 512.
+// IntrinName - Can be one of the following: {max|min}_{epi32|epu32|ps},
+//              e.g. _mm512_max_epi32
+// T1 - 'i' for int, '' (empty) for packed single. [__m512{i|}]
+// T2 - 'i' for int, 'f' for float. [__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ return Vec512[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
+ _mm512_reduce_maxMin_32bit(a, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i a) {
+ _mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
+ _mm512_reduce_maxMin_32bit(a, min_ps, , f);
+}
+
+// Vec512 - Vector of size 512.
+// Vec512Neutral - A 512-bit vector with all elements set to the identity
+//                 element.
+//                 Identity element: {max_epi,0x80000000}
+//                                   {max_epu,0x00000000}
+//                                   {max_ps, 0xFF800000}
+//                                   {min_epi,0x7FFFFFFF}
+//                                   {min_epu,0xFFFFFFFF}
+//                                   {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of the following: {max|min}_{epi32|epu32|ps},
+//              e.g. _mm512_max_epi32
+// T1 - 'i' for int, '' (empty) for packed single. [__m512{i|}]
+// T2 - 'i' for int, 'f' for float. [__v16s{i|f}]
+// T3 - 'd' for dword, 'ps' for packed single.
+//      [__builtin_ia32_select{d|ps}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f,
+ ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
+ ps, __M);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h
index 990e992a113f..3b58d043395a 100644
--- a/lib/Headers/avx512vlbwintrin.h
+++ b/lib/Headers/avx512vlbwintrin.h
@@ -615,172 +615,143 @@ _mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
- return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi)
- _mm256_setzero_si256 (),
- (__mmask32) __U);
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi)
- _mm256_setzero_si256 (),
- (__mmask32) __U);
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
+
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi)
- _mm_setzero_si128 (),
- (__mmask16) __U);
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi)
- _mm_setzero_si128 (),
- (__mmask16) __U);
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
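
Every hunk in this patch applies the same rewrite: compute the unmasked
operation with the ordinary intrinsic, then blend the result against the
passthrough (or zero) vector through a generic __builtin_ia32_select*
builtin, which lowers to a plain vector select in IR rather than a
target-specific masked node. A minimal semantic model of the byte-wide
128-bit select, for illustration only (the real builtin is
compiler-internal and operates on vector types, not arrays):

#include <stdint.h>

/* Model of __builtin_ia32_selectb_128: a set mask bit takes the operation
   result, a clear mask bit keeps the passthrough lane. */
static void selectb_128_model(uint16_t mask, const int8_t op[16],
                              const int8_t src[16], int8_t dst[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = ((mask >> i) & 1) ? op[i] : src[i];
}
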
@@ -816,937 +787,802 @@ _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) _mm_setzero_si128 (), __M);
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) __W, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) __W, __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) __W,
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) __W, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) __W,
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)__W);
}
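
In the pack conversions above, the element count changes across the
operation: _mm_packs_epi32 narrows 4 + 4 signed dwords into 8 saturated
words, so the masked forms select 16-bit lanes with an 8-bit mask even
though the inputs are dword vectors. A usage sketch (assumes a compiler
targeting AVX512BW and AVX512VL, e.g. -mavx512bw -mavx512vl):

#include <immintrin.h>

__m128i demo_maskz_packs(void) {
  __m128i a = _mm_set1_epi32(100000);   /* saturates to 32767 per word  */
  __m128i b = _mm_set1_epi32(-100000);  /* saturates to -32768 per word */
  /* keep words 0..3 (from __A), zero words 4..7 (from __B) */
  return _mm_maskz_packs_epi32((__mmask8)0x0F, a, b);
}
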
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
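
The avg rewrites wrap the ordinary rounding-average intrinsics; per lane,
the unsigned average behind _mm_avg_epu8 and friends is (a + b + 1) >> 1,
computed without intermediate overflow. A scalar model for one byte lane:

#include <stdint.h>

/* One 8-bit lane of vpavgb: widen, add with the rounding bias, shift. */
static inline uint8_t avg_epu8_model(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + (unsigned)b + 1u) >> 1);
}
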
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
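
The shuffle rewrites reuse _mm_shuffle_epi8/_mm256_shuffle_epi8, whose
per-lane behavior (pshufb) is: a set high bit in the control byte zeroes
the destination lane, otherwise the low four bits index into the source.
A scalar model of the 128-bit form:

#include <stdint.h>

/* Model of one pshufb step over 16 byte lanes. */
static void pshufb_model(const uint8_t src[16], const uint8_t ctl[16],
                         uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];
}
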
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
-{
- return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
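
The mask_ and maskz_ forms above differ only in the blend source: mask_
variants copy unselected lanes from the passthrough argument __W, while
maskz_ variants zero them. A usage sketch (assumes AVX512BW+AVX512VL,
e.g. -mavx512bw -mavx512vl):

#include <immintrin.h>

__m128i demo_mask_vs_maskz(__m128i a, __m128i b, __m128i w) {
  __mmask16 k = 0x00FF;                        /* select the low 8 bytes */
  __m128i m = _mm_mask_adds_epu8(w, k, a, b);  /* high 8 bytes from w    */
  __m128i z = _mm_maskz_adds_epu8(k, a, b);    /* high 8 bytes zeroed    */
  return _mm_xor_si128(m, z);                  /* nonzero only in lanes
                                                  taken from w           */
}
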
@@ -1828,69 +1664,60 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
- (__v16qi) __Y,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
- (__v16qi) __Y,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
- __m256i __Y) {
- return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
- (__v32qi) __Y,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+ __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
- (__v32qi) __Y,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v4si) __W,
- (__mmask8) __U);
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v4si) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v8si) __W,
- (__mmask8) __U);
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v8si) _mm256_setzero_si256(),
- (__mmask8) __U);
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
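
madd is another case where the operation changes the element width:
pmaddwd multiplies 16-bit pairs and sums them into 32-bit lanes, so the
128-bit masked forms select dwords (selectd_128) under a mask with only
four significant bits, rather than selecting words. A small sketch with
hypothetical values (assumes AVX512BW+AVX512VL):

#include <immintrin.h>

__m128i demo_maskz_madd(void) {
  __m128i a = _mm_set1_epi16(3);
  __m128i b = _mm_set1_epi16(4);
  /* every dword is 3*4 + 3*4 = 24; keep dwords 0..1, zero dwords 2..3 */
  return _mm_maskz_madd_epi16((__mmask8)0x3, a, b);
}
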
@@ -2056,104 +1883,89 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
{
__builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
}
+
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
- (__v8hi) __Y,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
- (__v8hi) __Y,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
- (__v16hi) __Y,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
- (__v16hi) __Y,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
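
In the sign- and zero-extensions below, the rewrite also corrects the mask
parameter types: widening 8 bytes into 8 words leaves only 8 meaningful
mask bits, so the 128-bit forms take __mmask8 (and the 256-bit forms
__mmask16) in place of the old __mmask32. A usage sketch (assumes
AVX512BW+AVX512VL):

#include <immintrin.h>

__m128i demo_mask_cvt(__m128i w, __m128i a) {
  /* sign-extend the low 8 bytes of a into words 4..7; words 0..3 from w */
  return _mm_mask_cvtepi8_epi16(w, (__mmask8)0xF0, a);
}
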
@@ -2269,72 +2081,68 @@ _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
@@ -2461,366 +2269,328 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
(__v16hi)_mm256_setzero_si256()); })
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sllv_epi16 (__m128i __A, __m128i __B)
+_mm_sllv_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
-
-#define _mm256_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm256_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)__W);
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
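A second change rides along here: the immediate-count shifts (slli/srli/srai) stop being macros and become real inline functions, since the select-based form simply forwards the count to the existing _mm_slli_epi16 and no longer needs to splice an immediate into a builtin call. A usage sketch, assuming a Clang with this change and -mavx512bw -mavx512vl:

#include <immintrin.h>

/* Shift the low four 16-bit lanes left by a run-time count; the high four
   lanes pass through from __W (here the input itself, i.e. merge masking). */
static inline __m128i shift_low_lanes(__m128i v, int count) {
  return _mm_mask_slli_epi16(v, (__mmask8)0x0F, v, count);
}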
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srlv_epi16 (__m128i __A, __m128i __B)
+_mm_srlv_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi16 (__m256i __A, __m256i __B)
+_mm256_srav_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi16 (__m128i __A, __m128i __B)
+_mm_srav_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm_maskz_srai_epi16(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
-#define _mm256_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)__W);
+}
-#define _mm256_maskz_srai_epi16(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
-#define _mm256_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)__W);
+}
-#define _mm256_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
@@ -3342,28 +3112,24 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
}
#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
- (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(N), \
- (__v16qi)(__m128i)(W), \
- (__mmask16)(U)); })
+ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)(__m128i)(W)); })
#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
- (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(N), \
- (__v16qi)_mm_setzero_si128(), \
- (__mmask16)(U)); })
+ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)_mm_setzero_si128()); })
#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
- (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), (int)(N), \
- (__v32qi)(__m256i)(W), \
- (__mmask32)(U)); })
+ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
- (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), (int)(N), \
- (__v32qi)_mm256_setzero_si256(), \
- (__mmask32)(U)); })
+ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)_mm256_setzero_si256()); })
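The alignr macros follow the same pattern in macro form: the unmasked _mm_alignr_epi8 / _mm256_alignr_epi8 produce the value and a selectb builtin applies the mask. A usage sketch, assuming -mavx512bw -mavx512vl:

#include <immintrin.h>

/* Treat a:b as one 32-byte buffer (a in the upper half), take the 16 bytes
   starting at byte offset 4, then zero every lane whose mask bit is clear. */
static inline __m128i masked_align4(__mmask16 keep, __m128i a, __m128i b) {
  return _mm_maskz_alignr_epi8(keep, a, b, 4);
}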
#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
(__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h
index 8187bcd6b28e..cd9da4370564 100644
--- a/lib/Headers/avx512vldqintrin.h
+++ b/lib/Headers/avx512vldqintrin.h
@@ -37,20 +37,17 @@ _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -59,293 +56,241 @@ _mm_mullo_epi64 (__m128i __A, __m128i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
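The masked FP logic ops above reduce to the ordinary SSE/AVX bitwise op plus a per-lane select, so they compose naturally with other idioms; for example, andnot against a sign-bit mask yields a per-lane absolute value. A sketch assuming -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Lane 0: x with its sign bit cleared, i.e. fabs(x); lane 1: taken from w.
   _mm_andnot_pd computes (~signmask) & x, which strips exactly the sign bit. */
static inline __m128d abs_low_lane(__m128d w, __m128d x) {
  const __m128d signmask = _mm_set1_pd(-0.0); /* only the sign bit set */
  return _mm_mask_andnot_pd(w, (__mmask8)0x1, signmask, x);
}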
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1151,82 +1096,72 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1); })
+ (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+ (__v4df)_mm256_undefined_pd(), \
+ ((imm) & 1) ? 2 : 0, \
+ ((imm) & 1) ? 3 : 1); })
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+ (__v2df)(W)); })
#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+ (__v2df)_mm_setzero_pd()); })
#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
+ (__v4di)_mm256_undefined_si256(), \
+ ((imm) & 1) ? 2 : 0, \
+ ((imm) & 1) ? 3 : 1); })
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+ (__v2di)(W)); })
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1); })
+ (__m256d)__builtin_shufflevector((__v4df)(A), \
+ (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
+ ((imm) & 0x1) ? 0 : 4, \
+ ((imm) & 0x1) ? 1 : 5, \
+ ((imm) & 0x1) ? 4 : 2, \
+ ((imm) & 0x1) ? 5 : 3); })
#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)(W)); })
#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)_mm256_setzero_pd()); })
#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v4di)(A), \
+ (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
+ ((imm) & 0x1) ? 0 : 4, \
+ ((imm) & 0x1) ? 1 : 5, \
+ ((imm) & 0x1) ? 4 : 2, \
+ ((imm) & 0x1) ? 5 : 3); })
#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)(W)); })
#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
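For the 64x2 extract/insert forms the unmasked value now comes from __builtin_shufflevector rather than a dedicated builtin: with (imm & 1) == 0 the extract shuffle picks elements {0,1}, otherwise {2,3}, and the second shuffle operand (an undefined vector) is never referenced. A small check, assuming -mavx512dq -mavx512vl:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* _mm256_set_pd takes arguments high-to-low, so lane i holds (double)i. */
  __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
  __m128d hi = _mm256_extractf64x2_pd(a, 1); /* imm & 1 -> elements 2,3 */
  double out[2];
  _mm_storeu_pd(out, hi);
  printf("%g %g\n", out[0], out[1]); /* prints: 2 3 */
  return 0;
}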
#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h
index 295ce291f7ce..f3744da6ab8a 100644
--- a/lib/Headers/avx512vlintrin.h
+++ b/lib/Headers/avx512vlintrin.h
@@ -616,277 +616,227 @@ _mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di) __W, __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
- __m128i __Y)
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di) __W, __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di) __W, __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
- __m128i __Y)
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di) __W, __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -1895,71 +1845,59 @@ _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -2196,32 +2134,30 @@ _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
- (__v4df) __W,
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2620,48 +2556,41 @@ _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepu32_pd (__m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
+ return (__m128d) __builtin_convertvector(
+ __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepu32_pd (__m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
+ return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df) __W,
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
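
The unmasked _mm_cvtepu32_pd and _mm256_cvtepu32_pd go one step further and drop the target builtin entirely in favor of the generic __builtin_shufflevector and __builtin_convertvector, which the backend is free to pattern-match. A scalar sketch of the new 128-bit body (the helper name is hypothetical; little-endian lane order assumed):

/* __builtin_shufflevector(A, A, 0, 1) keeps lanes 0 and 1;
   __builtin_convertvector(..., __v2df) then widens each lane. */
static inline void model_cvtepu32_pd(const unsigned int a[4], double r[2]) {
  r[0] = (double)a[0];  /* lane 0 */
  r[1] = (double)a[1];  /* lane 1 */
}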
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2711,72 +2640,59 @@ _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -3127,240 +3043,199 @@ _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
- (__v4si) __W,
- (__mmask8) __U);
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
- (__v8si) __W,
- (__mmask8) __U);
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3410,37 +3285,31 @@ _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3496,37 +3365,31 @@ _mm256_max_epi64 (__m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3582,37 +3445,31 @@ _mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3668,37 +3525,31 @@ _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -4095,132 +3946,115 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
(__v8si)(__m256i)(v1), (int)(scale)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
}
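
Two masking flavors recur throughout: _mm*_mask_* merges cleared lanes from __W, while _mm*_maskz_* zeroes them, so the zero vector handed to the select must match the lane type of the operation (a _ps select takes _mm_setzero_ps, never _mm_setzero_pd). Per lane the behavior is, with hypothetical scalar helpers:

/* Merge masking keeps the old lane; zero masking substitutes 0.0f. */
static inline float merge_lane(int on, float res, float old) {
  return on ? res : old;   /* _mm*_mask_*  */
}
static inline float zero_lane(int on, float res) {
  return on ? res : 0.0f;  /* _mm*_maskz_* */
}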
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -4551,344 +4385,324 @@ _mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
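
All of the widening conversions above follow the same recipe: reuse the existing SSE4.1/AVX2 conversion intrinsic and select against __W or a zero vector. A usage sketch for one of the functions defined above (function name, mask, and values are illustrative; compile with AVX-512VL enabled, for example -mavx512vl):

#include <immintrin.h>

/* Sign-extend the four low bytes of v to 32-bit lanes, zeroing lane 2;
   mask 0xB is 0b1011. */
static __m128i widen_low_bytes(__m128i v) {
  return _mm_maskz_cvtepi8_epi32(0xB, v);
}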
@@ -5125,125 +4939,132 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
(__mmask8)(U)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
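
Three shift suffixes appear in the hunks that follow, and they are easy to conflate: sll shifts every lane by the count held in the low 64 bits of the second vector operand, slli shifts every lane by an integer count, and sllv gives each lane its own count. A scalar model of one 32-bit lane (helper names hypothetical; counts of 32 or more zero the lane, matching the hardware):

#include <stdint.h>

static inline uint32_t sll_lane(uint32_t a, uint64_t cnt) {   /* _mm_sll_epi32  */
  return cnt < 32 ? a << cnt : 0;
}
static inline uint32_t slli_lane(uint32_t a, int imm) {       /* _mm_slli_epi32 */
  return (unsigned)imm < 32 ? a << (unsigned)imm : 0;
}
static inline uint32_t sllv_lane(uint32_t a, uint32_t cnt) {  /* _mm_sllv_epi32 */
  return cnt < 32 ? a << cnt : 0;
}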
-#define _mm_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
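
The slli wrappers also change shape, from macros to inline functions: after the rewrite the count only reaches _mm_slli_epi32 and _mm256_slli_epi32, which accept an ordinary int, so the macro form that existed to funnel an immediate into the old masked builtin is no longer needed. Usage sketch (illustrative names and values; needs -mavx512vl):

#include <immintrin.h>

/* Shift lanes 0 and 2 left by 3 and keep lanes 1 and 3 from src;
   mask 0x5 is 0b0101. */
static __m128i shift_even_lanes(__m128i src, __m128i v) {
  return _mm_mask_slli_epi32(src, 0x5, v, 3);
}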
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
-#define _mm_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)__W);
+}
-#define _mm256_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
+}
-#define _mm256_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)__W);
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_rorv_epi32 (__m128i __A, __m128i __B)
@@ -5366,387 +5187,335 @@ _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
-
-
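
For reference, the sllv/srlv forms shift each element by its own count, unlike sll/slli, which apply one count to all elements. A hedged usage sketch of the masked variant (assumes an AVX512VL target, e.g. -mavx512vl; function name is illustrative):

#include <immintrin.h>

/* Shift each 32-bit lane of v left by its own count, keeping lanes 0
   and 2 of the result; lanes 1 and 3 come from 'old'. */
static __m128i per_lane_shift(__m128i old, __m128i v) {
  __m128i counts = _mm_setr_epi32(0, 1, 2, 3);
  return _mm_mask_sllv_epi32(old, 0x5, v, counts);
}
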
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
-
-
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_srli_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_srli_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)__W);
+}
-#define _mm_maskz_srli_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
+}
-#define _mm256_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)__W);
+}
-#define _mm256_maskz_srli_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
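
Aside: for these logical shifts a count at or above the element width yields zero in the underlying instructions, so the masked wrappers stay well defined for any immediate. A small usage sketch (AVX512VL assumed):

#include <immintrin.h>

/* Zero-masked logical right shift: lanes outside the mask become 0. */
static __m256i halve_low_lanes(__m256i v) {
  return _mm256_maskz_srli_epi64(0x3, v, 1);
}
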
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi64 (__m128i __X, __m128i __Y)
+_mm_srav_epi64(__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di)__Y);

}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -5975,6 +5744,7 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
(__v8si)_mm256_setzero_si256(), \
(__mmask8)(M)); })
+#ifdef __x86_64__
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
{
@@ -6006,6 +5776,7 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
_mm256_setzero_si256 (),
__M);
}
+#endif
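
The new __x86_64__ guard reflects that these set1_epi64 forms take a long long scalar and lower to a broadcast from a 64-bit GPR, which 32-bit targets lack. Presumably a 32-bit build can get the same effect by combining the plain broadcast with a masked move; an illustrative sketch (the fallback name is made up):

#include <immintrin.h>

/* Stand-in for _mm_mask_set1_epi64 where the guarded form is
   unavailable (AVX512VL assumed). */
static __m128i mask_set1_epi64_fallback(__m128i old, __mmask8 m,
                                        long long a) {
  return _mm_mask_mov_epi64(old, m, _mm_set1_epi64x(a));
}
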
#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
(__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
@@ -6653,85 +6424,67 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
(__v8sf)_mm256_setzero_ps()); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
- __m128i __C)
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
{
- return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
- (__v2di) __C,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
{
- return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
- (__v2di) __C,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256i __C)
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
{
- return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
- (__v4di) __C,
- (__v4df) __W,
- (__mmask8)
- __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
{
- return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
- (__v4di) __C,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8)
- __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
- __m128i __C)
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
{
- return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
- (__v4si) __C,
- (__v4sf) __W,
- (__mmask8) __U);
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
{
- return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
- (__v4si) __C,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
- __m256i __C)
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
{
- return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
- (__v8si) __C,
- (__v8sf) __W,
- (__mmask8) __U);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
{
- return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
- (__v8si) __C,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)_mm256_setzero_ps());
}
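
The permutes follow the same select-over-unmasked pattern. A quick usage sketch of the masked form (AVX512VL assumed; names are illustrative):

#include <immintrin.h>

/* Bit 1 of each 64-bit control element picks the source half. Here the
   full permute would swap a's halves, but the 0x1 mask only replaces
   lane 0 (with a's high double); lane 1 keeps w's value. */
static __m128d masked_pick_high(__m128d w, __m128d a) {
  __m128i ctrl = _mm_set_epi64x(0, 2);
  return _mm_mask_permutevar_pd(w, 0x1, a, ctrl);
}
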
static __inline__ __mmask8 __DEFAULT_FN_ATTRS
@@ -6985,154 +6738,156 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
-#define _mm_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_srai_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_srai_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sra_epi64 (__m128i __A, __m128i __B)
+_mm_sra_epi64(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sra_epi64 (__m256i __A, __m128i __B)
+_mm256_sra_epi64(__m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+  return (__m256i)__builtin_ia32_psraq256((__v4di)__A, (__v2di)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sra_epi64(__A, __B),
+                                             (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sra_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
}
-#define _mm_srai_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi64(__m128i __A, int __imm)
+{
+ return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
-#define _mm_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srai_epi64(__A, __imm),
+                                             (__v2di)__W);
+}
-#define _mm_maskz_srai_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srai_epi64(__A, __imm),
+                                             (__v2di)_mm_setzero_di());
+}
-#define _mm256_srai_epi64(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi64(__m256i __A, int __imm)
+{
+ return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
-#define _mm256_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srai_epi64(__A, __imm),
+                                             (__v4di)__W);
+}
-#define _mm256_maskz_srai_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srai_epi64(__A, __imm),
+                                             (__v4di)_mm256_setzero_si256());
+}
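
Unlike the logical shifts, these 64-bit arithmetic right shifts have no SSE/AVX2 counterpart (there is no legacy psraq), and they replicate the sign bit. A small sketch:

#include <immintrin.h>

/* Arithmetic halving of negative values: -8 >> 1 gives -4 per lane,
   where a logical shift would produce a large positive number. */
static __m128i arith_halve(void) {
  return _mm_srai_epi64(_mm_set1_epi64x(-8), 1);
}
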
#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
(__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
@@ -8473,79 +8228,84 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
}
#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1); })
+ (__m128)__builtin_shufflevector((__v8sf)(__m256)(A), \
+ (__v8sf)_mm256_undefined_ps(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+ (__v4sf)(W)); })
#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+ (__v4sf)_mm_setzero_ps()); })
#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)-1); })
+  (__m128i)__builtin_shufflevector((__v8si)(__m256i)(A), \
+ (__v8si)_mm256_undefined_si256(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+ (__v4si)(W)); })
#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1); })
+ (__m256)__builtin_shufflevector((__v8sf)(A), \
+ (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)(W)); })
#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)_mm256_setzero_ps()); })
#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v8si)(A), \
+ (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)(W)); })
#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
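
These extract/insert rewrites express the 128-bit lane moves as plain shuffles the optimizer can see through. In scalar terms the extract just picks one half; an illustrative model:

#include <string.h>

/* Scalar model of _mm256_extractf32x4_ps: imm & 1 selects the low (0)
   or high (1) 128-bit half of a 256-bit vector viewed as 8 floats. */
static void extractf32x4_model(const float a[8], int imm, float out[4]) {
  memcpy(out, a + ((imm & 1) ? 4 : 0), 4 * sizeof(float));
}
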
#define _mm_getmant_pd(A, B, C) __extension__({\
(__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
@@ -8860,76 +8620,78 @@ _mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
}
#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(A), \
+ ((int)(imm) & 0x3) + 0, \
+ ((int)(imm) & 0x3) + 1, \
+ ((int)(imm) & 0x3) + 2, \
+ ((int)(imm) & 0x3) + 3); })
#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)(__m128i)(W)); })
#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)_mm256_undefined_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(A), \
+ ((int)(imm) & 0x7) + 0, \
+ ((int)(imm) & 0x7) + 1, \
+ ((int)(imm) & 0x7) + 2, \
+ ((int)(imm) & 0x7) + 3, \
+ ((int)(imm) & 0x7) + 4, \
+ ((int)(imm) & 0x7) + 5, \
+ ((int)(imm) & 0x7) + 6, \
+ ((int)(imm) & 0x7) + 7); })
#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(A), \
+ ((int)(imm) & 0x1) + 0, \
+ ((int)(imm) & 0x1) + 1); })
#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)(__m128i)(W)); })
#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)_mm256_undefined_pd(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(A), \
+ ((int)(imm) & 0x3) + 0, \
+ ((int)(imm) & 0x3) + 1, \
+ ((int)(imm) & 0x3) + 2, \
+ ((int)(imm) & 0x3) + 3); })
#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
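
The alignr shuffles above encode: concatenate A above B, shift the pair right by imm elements (taken modulo the vector length), and keep the low half. An illustrative scalar model for the 4 x 32-bit case:

/* cat = B (low) : A (high); out[i] = cat[(imm & 3) + i], matching the
   shufflevector indices in _mm_alignr_epi32 above. */
static void alignr_epi32_model(const int a[4], const int b[4],
                               int imm, int out[4]) {
  int cat[8];
  for (int i = 0; i < 4; ++i) { cat[i] = b[i]; cat[i + 4] = a[i]; }
  for (int i = 0; i < 4; ++i) out[i] = cat[(imm & 3) + i];
}
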
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 32e8546817b3..be03ba346031 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -57,7 +57,7 @@ typedef long long __m256i __attribute__((__vector_size__(32)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -75,7 +75,7 @@ _mm256_add_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -93,7 +93,7 @@ _mm256_add_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the minuend.
@@ -111,7 +111,7 @@ _mm256_sub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the minuend.
@@ -130,7 +130,7 @@ _mm256_sub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the left source operand.
@@ -149,7 +149,7 @@ _mm256_addsub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the left source operand.
@@ -167,7 +167,7 @@ _mm256_addsub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the dividend.
@@ -185,7 +185,7 @@ _mm256_div_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the dividend.
@@ -204,7 +204,7 @@ _mm256_div_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -223,7 +223,7 @@ _mm256_max_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -242,7 +242,7 @@ _mm256_max_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -261,7 +261,7 @@ _mm256_min_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -279,7 +279,7 @@ _mm256_min_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -297,7 +297,7 @@ _mm256_mul_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -316,7 +316,7 @@ _mm256_mul_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -333,7 +333,7 @@ _mm256_sqrt_pd(__m256d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -350,7 +350,7 @@ _mm256_sqrt_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -367,7 +367,7 @@ _mm256_rsqrt_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -389,24 +389,24 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
/// \param M
-/// An integer value that specifies the rounding operation.
-/// Bits [7:4] are reserved.
-/// Bit [3] is a precision exception value:
-/// 0: A normal PE exception is used.
-/// 1: The PE field is not updated.
-/// Bit [2] is the rounding control source:
-/// 0: Use bits [1:0] of M.
-/// 1: Use the current MXCSR setting.
-/// Bits [1:0] contain the rounding control definition:
-/// 00: Nearest.
-/// 01: Downward (toward negative infinity).
-/// 10: Upward (toward positive infinity).
-/// 11: Truncated.
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) __extension__ ({ \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
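
In practice M is usually built from the standard _MM_FROUND_* macros rather than raw bits; for example, rounding toward negative infinity without raising precision exceptions:

#include <immintrin.h>

/* M = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC selects bits [1:0]=01
   (downward) and sets bit [3] (do not update the PE field). */
static __m256d round_down(__m256d v) {
  return _mm256_round_pd(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}
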
@@ -421,24 +421,24 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
/// \param M
-/// An integer value that specifies the rounding operation.
-/// Bits [7:4] are reserved.
-/// Bit [3] is a precision exception value:
-/// 0: A normal PE exception is used.
-/// 1: The PE field is not updated.
-/// Bit [2] is the rounding control source:
-/// 0: Use bits [1:0] of M.
-/// 1: Use the current MXCSR setting.
-/// Bits [1:0] contain the rounding control definition:
-/// 00: Nearest.
-/// 01: Downward (toward negative infinity).
-/// 10: Upward (toward positive infinity).
-/// 11: Truncated.
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) __extension__ ({ \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
@@ -453,7 +453,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
@@ -470,7 +470,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
@@ -488,7 +488,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
@@ -505,7 +505,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
@@ -517,7 +517,7 @@ _mm256_rcp_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -535,7 +535,7 @@ _mm256_and_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -554,7 +554,7 @@ _mm256_and_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the left source operand. The
@@ -575,7 +575,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the left source operand. The
@@ -595,7 +595,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -613,7 +613,7 @@ _mm256_or_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -631,7 +631,7 @@ _mm256_or_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -649,7 +649,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -669,7 +669,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -692,7 +692,7 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -715,7 +715,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -738,7 +738,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -762,23 +762,23 @@ _mm256_hsub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __c
/// A 128-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [65]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_permutevar_pd(__m128d __a, __m128i __c)
@@ -786,37 +786,37 @@ _mm_permutevar_pd(__m128d __a, __m128i __c)
return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
}
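
To make the bit numbering above concrete (bit 1 steers the low result element; bit 65, i.e. bit 1 of the second control element, steers the high one), a broadcast example:

#include <immintrin.h>

/* Both control elements are 2 (bit 1 set), so both result elements
   receive the source's high double: {a[1], a[1]}. */
static __m128d dup_high(__m128d a) {
  return _mm_permutevar_pd(a, _mm_set1_epi64x(2));
}
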
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-/// specified by the 256-bit integer vector operand.
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
+/// by the 256-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
/// \param __c
/// A 256-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [65]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// Bit [129]:
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// Bit [193]:
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [129]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [193]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
@@ -827,52 +827,51 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c)
/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
/// specified by the 128-bit integer vector operand.
-///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \param __c
/// A 128-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [33:32]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [65:64]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [97:96]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_permutevar_ps(__m128 __a, __m128i __c)
@@ -885,85 +884,85 @@ _mm_permutevar_ps(__m128 __a, __m128i __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
/// \param __c
/// A 256-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [33:32]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [65:64]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [97:96]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// Bits [129:128]:
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// Bits [161:160]:
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// Bits [193:192]:
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// Bits [225:224]:
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [129:128]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [161:160]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [193:192]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [225:224]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_permutevar_ps(__m256 __a, __m256i __c)
@@ -971,8 +970,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
}
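For illustration only (not part of the patch; the helper name is hypothetical): a sketch of the in-lane selection for _mm256_permutevar_ps, assuming an AVX target. The low two bits of each 32-bit selector element pick a float from the same 128-bit half, so an all-zero selector broadcasts element 0 within each half.

    #include <immintrin.h>

    /* Returns {v0,v0,v0,v0, v4,v4,v4,v4}: every selector field is 00. */
    static __m256 broadcast_within_halves(__m256 v)
    {
        return _mm256_permutevar_ps(v, _mm256_setzero_si256());
    }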
-/// \brief Copies the values in a 128-bit vector of [2 x double] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+/// by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -980,30 +979,31 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
/// A 128-bit vector of [2 x double].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bit [0]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
(__v2df)_mm_undefined_pd(), \
((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
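For illustration only (not part of the patch; the helper name is hypothetical): with the immediate form, the same selection is encoded in two bits of C. An immediate of 0x1 sets bit 0 and clears bit 1, which, per the table above, swaps the two doubles.

    #include <immintrin.h>

    static __m128d swap_pd(__m128d v)
    {
        return _mm_permute_pd(v, 0x1);   /* {v[1], v[0]} */
    }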
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1011,32 +1011,33 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
/// A 256-bit vector of [4 x double].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bit [0]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// Bit [2]:
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// Bit [3]:
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [2]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [3]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
@@ -1046,8 +1047,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
2 + (((C) >> 2) & 0x1), \
2 + (((C) >> 3) & 0x1)); })
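For illustration only (not part of the patch; the helper name is hypothetical): in the 256-bit form, each pair of immediate bits operates on its own 128-bit half. An immediate of 0x5 (binary 0101) sets bits 0 and 2, swapping the doubles within each half.

    #include <immintrin.h>

    static __m256d swap_pd_within_halves(__m256d v)
    {
        return _mm256_permute_pd(v, 0x5);   /* {v[1], v[0], v[3], v[2]} */
    }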
-/// \brief Copies the values in a 128-bit vector of [4 x float] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1055,48 +1056,49 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
/// A 128-bit vector of [4 x float].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
@@ -1104,8 +1106,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
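For illustration only (not part of the patch; the helper name is hypothetical): the four 2-bit fields of C each pick one source float. An immediate of 0x1B selects elements 3, 2, 1, 0 for destinations 0 through 3, reversing the vector.

    #include <immintrin.h>

    static __m128 reverse_ps(__m128 v)
    {
        return _mm_permute_ps(v, 0x1B);   /* {v[3], v[2], v[1], v[0]} */
    }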
-/// \brief Copies the values in a 256-bit vector of [8 x float] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1113,84 +1115,85 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
/// A 256-bit vector of [8 x float].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// Bits [1:0]:
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
+///    An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+///    00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [1:0]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
@@ -1213,7 +1216,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit vector of [4 x double].
@@ -1221,25 +1224,25 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
///    A 256-bit vector of [4 x double].
/// \param M
/// An immediate integer operand specifying how the values are to be
-/// permuted.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
-/// destination.
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
@@ -1254,7 +1257,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit vector of [8 x float].
@@ -1262,24 +1265,24 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit vector of [8 x float].
/// \param M
/// An immediate integer operand specifying how the values are to be
-/// permuted.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
@@ -1295,7 +1298,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit integer vector.
@@ -1303,23 +1306,23 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit integer vector.
/// \param M
/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
@@ -1337,7 +1340,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
/// A 256-bit vector of [4 x double].
@@ -1347,9 +1350,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// An immediate integer operand, with mask bits [3:0] specifying how the
/// values are to be copied. The position of the mask bit corresponds to the
/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
-/// element in operand V1 is copied to the same position in the destination.
-/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
-/// copied to the same position in the destination.
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 64-bit element in
+/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
@@ -1369,7 +1372,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
/// A 256-bit vector of [8 x float].
@@ -1379,9 +1382,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// An immediate integer operand, with mask bits [7:0] specifying how the
/// values are to be copied. The position of the mask bit corresponds to the
/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
-/// element in operand V1 is copied to the same position in the destination.
-/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
-/// copied to the same position in the destination.
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 32-bit element in
+/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
@@ -1401,7 +1404,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -1411,9 +1414,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
/// how the values are to be copied. The position of the mask bit corresponds
/// to the most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 64-bit element in operand __a is copied to the same
+/// corresponding 64-bit element in operand \a __a is copied to the same
/// position in the destination. When a mask bit is 1, the corresponding
-/// 64-bit element in operand __b is copied to the same position in the
+/// 64-bit element in operand \a __b is copied to the same position in the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
@@ -1429,7 +1432,7 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -1439,9 +1442,9 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
/// and 31 specifying how the values are to be copied. The position of the
/// mask bit corresponds to the most significant bit of a copied value. When
-/// a mask bit is 0, the corresponding 32-bit element in operand __a is
+/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
/// copied to the same position in the destination. When a mask bit is 1, the
-/// corresponding 32-bit element in operand __b is copied to the same
+/// corresponding 32-bit element in operand \a __b is copied to the same
/// position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
@@ -1455,12 +1458,12 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Computes two dot products in parallel, using the lower and upper
/// halves of two [8 x float] vectors as input to the two computations, and
/// returning the two dot products in the lower and upper halves of the
-/// [8 x float] result. The immediate integer operand controls which
-/// input elements will contribute to the dot product, and where the final
-/// results are returned. In general, for each dot product, the four
-/// corresponding elements of the input vectors are multiplied; the first
-/// two and second two products are summed, then the two sums are added to
-/// form the final result.
+/// [8 x float] result. The immediate integer operand controls which input
+/// elements will contribute to the dot product, and where the final results
+/// are returned. In general, for each dot product, the four corresponding
+/// elements of the input vectors are multiplied; the first two and second
+/// two products are summed, then the two sums are added to form the final
+/// result.
///
/// \headerfile <x86intrin.h>
///
@@ -1468,7 +1471,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
/// A vector of [8 x float] values, treated as two [4 x float] vectors.
@@ -1510,7 +1513,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
/// A 256-bit vector of [8 x float]. The four selected elements in this
@@ -1522,22 +1525,23 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination, according to the bits specified in the immediate operand.
/// \param mask
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a and b. Bits [3:0] specify the values copied from operand a.
-/// Bits [7:4] specify the values copied from operand b.
+///    copy from \a a and \a b. \n
+/// Bits [3:0] specify the values copied from operand \a a. \n
+/// Bits [7:4] specify the values copied from operand \a b. \n
/// The destinations within the 256-bit destination are assigned values as
-/// follows, according to the bit value assignments described below:
+/// follows, according to the bit value assignments described below: \n
/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
-/// destination.
+/// destination. \n
/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
-/// destination.
+/// destination. \n
/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
-/// destination.
+/// destination. \n
/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
-/// the destination.
-/// Bit value assignments:
-/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
-/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
-/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
+/// the destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
@@ -1567,7 +1571,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
/// A 256-bit vector of [4 x double].
@@ -1575,22 +1579,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [4 x double].
/// \param mask
/// An immediate value containing 8-bit values specifying which elements to
-/// copy from a and b:
-/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
-/// destination.
-/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
-/// destination.
-/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
-/// destination.
-/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
-/// destination.
-/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
-/// destination.
-/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
-/// destination.
-/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
-/// destination.
-/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
+/// copy from \a a and \a b: \n
+/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+/// destination. \n
+/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
@@ -1647,7 +1651,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
/// A 128-bit vector of [2 x double].
@@ -1655,16 +1659,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [2 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
@@ -1683,7 +1688,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -1691,16 +1696,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [4 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
@@ -1719,7 +1725,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
/// A 256-bit vector of [4 x double].
@@ -1727,16 +1733,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [4 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
@@ -1755,7 +1762,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
/// A 256-bit vector of [8 x float].
@@ -1763,16 +1770,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [8 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
@@ -1790,7 +1798,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
/// A 128-bit vector of [2 x double].
@@ -1798,16 +1806,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [2 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
@@ -1825,7 +1834,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -1833,16 +1842,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [4 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
@@ -1854,8 +1864,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
@@ -1876,8 +1886,8 @@ _mm256_extract_epi32(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [16 x i16].
@@ -1898,8 +1908,8 @@ _mm256_extract_epi16(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [32 x i8].
@@ -1921,8 +1931,8 @@ _mm256_extract_epi8(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [4 x i64].
@@ -1945,8 +1955,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [8 x i32] to be used by the insert operation.
@@ -1955,8 +1965,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
@@ -1972,8 +1982,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [16 x i16] to be used by the insert operation.
@@ -1982,8 +1992,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
{
@@ -1998,8 +2008,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [32 x i8] to be used by the insert operation.
@@ -2008,8 +2018,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
{
@@ -2025,8 +2035,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [4 x i64] to be used by the insert operation.
@@ -2035,8 +2045,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
@@ -2051,7 +2061,7 @@ _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector of [4 x i32].
@@ -2066,7 +2076,7 @@ _mm256_cvtepi32_pd(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
///
/// \param __a
/// A 256-bit integer vector.
@@ -2082,7 +2092,7 @@ _mm256_cvtepi32_ps(__m256i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -2097,7 +2107,7 @@ _mm256_cvtpd_ps(__m256d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -2108,24 +2118,66 @@ _mm256_cvtps_epi32(__m256 __a)
return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
}
+/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
+/// x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtps_pd(__m128 __a)
{
return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
}
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32], truncating the result by rounding towards zero when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epi32(__m256d __a)
{
return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
}
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32]. When a conversion is inexact, the value returned is rounded
+/// according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epi32(__m256d __a)
{
return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
}
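For illustration, a minimal sketch of how the truncating and MXCSR-rounded conversions differ on exact halfway cases (hypothetical helper; assumes AVX code generation and the default round-to-nearest-even rounding mode):

#include <immintrin.h>

static void demo_pd_to_epi32(void)
{
  __m256d d = _mm256_set_pd(-2.5, 2.5, -1.5, 1.5); /* low to high: 1.5, -1.5, 2.5, -2.5 */
  __m128i t = _mm256_cvttpd_epi32(d); /* truncated toward zero: { 1, -1, 2, -2 } */
  __m128i r = _mm256_cvtpd_epi32(d);  /* round-to-nearest-even: { 2, -2, 2, -2 } */
  (void)t; (void)r;
}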
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
+/// truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epi32(__m256 __a)
{
@@ -2152,18 +2204,73 @@ _mm256_cvtss_f32(__m256 __a)
}
/* Vector replicate */
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
+/// vector of [8 x float] to float values in a 256-bit vector of
+/// [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_movehdup_ps(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
}
+/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
+/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_moveldup_ps(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
}
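A quick sketch of the two duplication patterns (hypothetical helper, assuming AVX is enabled), with the elements of the argument written low to high:

#include <immintrin.h>

static void demo_dup_ps(__m256 a) /* a = { a0, a1, a2, a3, a4, a5, a6, a7 } */
{
  __m256 odd  = _mm256_movehdup_ps(a); /* { a1, a1, a3, a3, a5, a5, a7, a7 } */
  __m256 even = _mm256_moveldup_ps(a); /* { a0, a0, a2, a2, a4, a4, a6, a6 } */
  (void)odd; (void)even;
}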
+/// \brief Moves and duplicates double-precision floating point values from a
+/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
+/// vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double]. \n
+/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+/// return value. \n
+/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+/// the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+/// duplicated values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_movedup_pd(__m256d __a)
{
@@ -2171,24 +2278,98 @@ _mm256_movedup_pd(__m256d __a)
}
/* Unpack and Interleave */
+/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the return value. \n
+/// Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the return value. \n
+/// Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
}
+/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the return value. \n
+/// Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the return value. \n
+/// Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
}
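A short sketch of the resulting element order (hypothetical helper, assuming AVX); note that each 128-bit lane is interleaved independently, so this is not a full 256-bit unpack:

#include <immintrin.h>

static void demo_unpack_pd(__m256d a, __m256d b)
{
  /* a = { a0, a1, a2, a3 }, b = { b0, b1, b2, b3 }, low to high */
  __m256d hi = _mm256_unpackhi_pd(a, b); /* { a1, b1, a3, b3 } */
  __m256d lo = _mm256_unpacklo_pd(a, b); /* { a0, b0, a2, b2 } */
  (void)hi; (void)lo;
}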
+/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the return value. \n
+/// Bits [127:96] are written to bits [95:64] of the return value. \n
+/// Bits [223:192] are written to bits [159:128] of the return value. \n
+/// Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [63:32] of the return value. \n
+/// Bits [127:96] are written to bits [127:96] of the return value. \n
+/// Bits [223:192] are written to bits [191:160] of the return value. \n
+/// Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}
+/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the return value. \n
+/// Bits [63:32] are written to bits [95:64] of the return value. \n
+/// Bits [159:128] are written to bits [159:128] of the return value. \n
+/// Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the return value. \n
+/// Bits [63:32] are written to bits [127:96] of the return value. \n
+/// Bits [159:128] are written to bits [191:160] of the return value. \n
+/// Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
@@ -2196,90 +2377,401 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b)
}
/* Bit Test */
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns The ZF flag in the EFLAGS register.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns The CF flag in the EFLAGS register.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
}
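One common way to use this family (hypothetical helper, assuming AVX): passing the same vector as both operands turns the ZF test into an "any sign bit set" query:

#include <immintrin.h>

/* Nonzero iff some element of x has its sign bit set
   (negative, -0.0, or a NaN with the sign bit set). */
static int any_sign_set_pd(__m128d x)
{
  return !_mm_testz_pd(x, x); /* ZF = 1 only when no sign bit is set */
}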
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns The ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns The CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns The ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns The CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns The ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns The CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns The ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_si256(__m256i __a, __m256i __b)
{
return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
}
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns The CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_si256(__m256i __a, __m256i __b)
{
return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
}
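Two common idioms built on these bitwise tests (hypothetical helpers, assuming AVX):

#include <immintrin.h>

static int is_all_zeros(__m256i v)
{
  return _mm256_testz_si256(v, v);    /* ZF = 1 iff v has no 1 bits */
}

static int is_all_ones(__m256i v)
{
  __m256i ones = _mm256_set1_epi32(-1);
  return _mm256_testc_si256(v, ones); /* CF = 1 iff ~v has no 1 bits */
}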
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_si256(__m256i __a, __m256i __b)
{
@@ -2287,12 +2779,36 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b)
}
/* Vector extract sign mask */
+/// \brief Extracts the sign bits of double-precision floating point elements
+/// in a 256-bit vector of [4 x double] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the double-precision
+/// floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_pd(__m256d __a)
{
return __builtin_ia32_movmskpd256((__v4df)__a);
}
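A small worked example (hypothetical helper, assuming AVX): elements 1 and 3, counting from the low element, are negative, so bits 1 and 3 of the result are set:

#include <immintrin.h>

static int demo_movemask_pd(void)
{
  __m256d v = _mm256_set_pd(-4.0, 3.0, -2.0, 1.0); /* low to high: 1.0, -2.0, 3.0, -4.0 */
  return _mm256_movemask_pd(v); /* 0b1010 == 10 */
}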
+/// \brief Extracts the sign bits of single-precision floating point elements
+/// in a 256-bit vector of [8 x float] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the single-precision floating
+/// point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_ps(__m256 __a)
{
@@ -2300,12 +2816,22 @@ _mm256_movemask_ps(__m256 __a)
}
/* Vector __zero */
+/// \brief Zeroes the contents of all XMM or YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroall(void)
{
__builtin_ia32_vzeroall();
}
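A typical use of the zeroing intrinsics (hypothetical sketch, assuming AVX): clearing the upper YMM state before executing legacy SSE code avoids AVX/SSE transition penalties on some microarchitectures:

#include <immintrin.h>

static void end_of_avx_region(void)
{
  /* ... 256-bit AVX computation ... */
  _mm256_zeroupper(); /* documented below; clears bits [255:128] of each YMM */
  /* ... legacy SSE code can now run without a state-transition stall ... */
}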
+/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroupper(void)
{
@@ -2313,6 +2839,18 @@ _mm256_zeroupper(void)
}
/* Vector load with broadcast */
+/// \brief Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+/// A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_broadcast_ss(float const *__a)
{
@@ -2320,6 +2858,18 @@ _mm_broadcast_ss(float const *__a)
return (__m128)(__v4sf){ __f, __f, __f, __f };
}
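A minimal usage sketch (hypothetical helper, assuming AVX):

#include <immintrin.h>

static __m128 splat4(const float *p)
{
  return _mm_broadcast_ss(p); /* { *p, *p, *p, *p } */
}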
+/// \brief Loads a scalar double-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+/// A pointer to the double-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+/// equal to the broadcast value.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_sd(double const *__a)
{
@@ -2327,6 +2877,18 @@ _mm256_broadcast_sd(double const *__a)
return (__m256d)(__v4df){ __d, __d, __d, __d };
}
+/// \brief Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of an [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+/// A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ss(float const *__a)
{
@@ -2334,12 +2896,36 @@ _mm256_broadcast_ss(float const *__a)
return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
}
+/// \brief Loads a 128-bit vector of [2 x double] from the address pointed to
+/// by \a __a and broadcasts it to both 128-bit elements of a 256-bit
+/// vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+/// equal to the broadcast value.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_pd(__m128d const *__a)
{
return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
}
+/// \brief Loads a 128-bit vector of [4 x float] from the address pointed to
+/// by \a __a and broadcasts it to both 128-bit elements of a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+/// A pointer to the 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+/// equal to the broadcast value.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ps(__m128 const *__a)
{
@@ -2347,18 +2933,50 @@ _mm256_broadcast_ps(__m128 const *__a)
}
/* SIMD load ops */
+/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing
+/// double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_load_pd(double const *__p)
{
return *(__m256d *)__p;
}
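A sketch of the alignment requirement (hypothetical helper, assuming AVX and a compiler supporting the GNU aligned attribute); passing a pointer that is not 32-byte aligned here is undefined behavior:

#include <immintrin.h>

static __m256d demo_aligned_load(void)
{
  static double buf[4] __attribute__((aligned(32))) = { 1.0, 2.0, 3.0, 4.0 };
  return _mm256_load_pd(buf); /* { 1.0, 2.0, 3.0, 4.0 }, low to high */
}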
+/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_load_ps(float const *__p)
{
return *(__m256 *)__p;
}
+/// \brief Loads 4 double-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing double-precision floating
+/// point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu_pd(double const *__p)
{
@@ -2368,6 +2986,17 @@ _mm256_loadu_pd(double const *__p)
return ((struct __loadu_pd*)__p)->__v;
}
+/// \brief Loads 8 single-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing single-precision floating
+/// point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu_ps(float const *__p)
{
@@ -2377,12 +3006,33 @@ _mm256_loadu_ps(float const *__p)
return ((struct __loadu_ps*)__p)->__v;
}
+/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
+/// location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
+/// values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_load_si256(__m256i const *__p)
{
return *__p;
}
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu_si256(__m256i const *__p)
{
@@ -2392,6 +3042,18 @@ _mm256_loadu_si256(__m256i const *__p)
return ((struct __loadu_si256*)__p)->__v;
}
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+/// perform better than <c> _mm256_loadu_si256 </c> when the data crosses a
+/// line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_lddqu_si256(__m256i const *__p)
{
@@ -2399,18 +3061,55 @@ _mm256_lddqu_si256(__m256i const *__p)
}
/* SIMD store ops */
+/// \brief Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to a 32-byte aligned memory location pointed to by
+/// \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// double-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_pd(double *__p, __m256d __a)
{
*(__m256d *)__p = __a;
}
+/// \brief Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_ps(float *__p, __m256 __a)
{
*(__m256 *)__p = __a;
}
+/// \brief Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_pd(double *__p, __m256d __a)
{
@@ -2420,6 +3119,17 @@ _mm256_storeu_pd(double *__p, __m256d __a)
((struct __storeu_pd*)__p)->__v = __a;
}
+/// \brief Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_ps(float *__p, __m256 __a)
{
@@ -2429,12 +3139,35 @@ _mm256_storeu_ps(float *__p, __m256 __a)
((struct __storeu_ps*)__p)->__v = __a;
}
+/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
+/// aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_si256(__m256i *__p, __m256i __a)
{
*__p = __a;
}
+/// \brief Stores integer values from a 256-bit integer vector to an unaligned
+/// memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_si256(__m256i *__p, __m256i __a)
{
@@ -2445,12 +3178,48 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a)
}
/* Conditional load ops */
+/// \brief Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [2 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_maskload_pd(double const *__p, __m128i __m)
{
return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
}
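A minimal sketch (hypothetical helper, assuming AVX): only the low lane is loaded, the masked-off high lane reads as 0.0, and its memory is never accessed:

#include <immintrin.h>

static __m128d demo_maskload_pd(const double *p)
{
  __m128i mask = _mm_set_epi64x(0, -1); /* low lane on, high lane off */
  return _mm_maskload_pd(p, mask);      /* { p[0], 0.0 } */
}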
+/// \brief Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [4 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element represents the mask bits. If a
+/// mask bit is zero, the corresponding value in the memory location is not
+/// loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_maskload_pd(double const *__p, __m256i __m)
{
@@ -2458,12 +3227,48 @@ _mm256_maskload_pd(double const *__p, __m256i __m)
(__v4di)__m);
}
+/// \brief Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [4 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_maskload_ps(float const *__p, __m128i __m)
{
return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
}
+/// \brief Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [8 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element represents the mask bits. If a mask
+/// bit is zero, the corresponding value in the memory location is not loaded
+/// and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_maskload_ps(float const *__p, __m256i __m)
{
@@ -2471,24 +3276,96 @@ _mm256_maskload_ps(float const *__p, __m256i __m)
}
/* Conditional store ops */
+/// \brief Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element in the mask vector represents the
+/// mask bits. If a mask bit is zero, the corresponding value from vector
+/// \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
{
__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
}
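A minimal sketch (hypothetical helper, assuming AVX): only the even-indexed lanes are written; the odd-indexed floats in memory are left untouched:

#include <immintrin.h>

static void demo_maskstore_ps(float *out, __m256 v)
{
  __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
  _mm256_maskstore_ps(out, mask, v); /* stores lanes 0, 2, 4 and 6 only */
}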
+/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+/// zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
{
__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
}
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element in the mask vector represents
+/// the mask bits. If a mask bit is zero, the corresponding value from vector
+/// \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
{
__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
}
+/// \brief Moves single-precision floating point values from a 128-bit vector
+/// of [4 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+/// zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
{
@@ -2496,18 +3373,58 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
}
/* Cacheability support ops */
+/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// integer values.
+/// \param __b
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
}
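A sketch of a typical non-temporal fill loop (hypothetical helper, assuming AVX and a 32-byte aligned destination); the final _mm_sfence() orders the streaming stores with respect to later memory operations:

#include <immintrin.h>
#include <stddef.h>

static void stream_fill(__m256i *dst, __m256i v, size_t n)
{
  for (size_t i = 0; i != n; ++i)
    _mm256_stream_si256(dst + i, v);
  _mm_sfence();
}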
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a 32-byte aligned memory location. To minimize caching, the data is
+/// flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// double-precision floating point values.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
}
+/// \brief Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// single-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
@@ -2515,30 +3432,105 @@ _mm256_stream_ps(float *__p, __m256 __a)
}
/* Create vectors */
+/// \brief Creates a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_undefined_pd(void)
{
return (__m256d)__builtin_ia32_undef256();
}
+/// \brief Creates a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_undefined_ps(void)
{
return (__m256)__builtin_ia32_undef256();
}
+/// \brief Creates a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_undefined_si256(void)
{
return (__m256i)__builtin_ia32_undef256();
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
return (__m256d){ __d, __c, __b, __a };
}
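Note the argument order (hypothetical helper, assuming AVX): the first argument initializes the highest element, so the in-memory order, low to high, is the reverse of the argument list:

#include <immintrin.h>

static __m256d demo_set_pd(void)
{
  return _mm256_set_pd(4.0, 3.0, 2.0, 1.0); /* low to high: 1.0, 2.0, 3.0, 4.0 */
}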
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
+/// with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_ps(float __a, float __b, float __c, float __d,
float __e, float __f, float __g, float __h)
@@ -2546,6 +3538,31 @@ _mm256_set_ps(float __a, float __b, float __c, float __d,
return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
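+///
+/// For example, an illustrative sketch of the argument-to-lane mapping
+/// (assumes AVX is enabled):
+/// \code
+///   int out[8];
+///   __m256i v = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+///   _mm256_storeu_si256((__m256i *)out, v);  // out == {0, 1, ..., 7};
+///                                            // __i0 (7) is the highest lane
+/// \endcode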
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
int __i4, int __i5, int __i6, int __i7)
@@ -2553,6 +3570,47 @@ _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
short __w11, short __w10, short __w09, short __w08,
@@ -2563,6 +3621,79 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
char __b27, char __b26, char __b25, char __b24,
@@ -2581,6 +3712,23 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
};
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
{
@@ -2588,12 +3736,68 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
}
/* Create vectors with elements in reverse order */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
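+///
+/// For example, this illustrative sketch shows that the arguments are laid
+/// out in memory order, the reverse of _mm256_set_pd:
+/// \code
+///   double out[4];
+///   __m256d v = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
+///   _mm256_storeu_pd(out, v);  // out == {0.0, 1.0, 2.0, 3.0}; the same
+///                              // vector as _mm256_set_pd(3.0, 2.0, 1.0, 0.0)
+/// \endcode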
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
return (__m256d){ __a, __b, __c, __d };
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float],
+///    initialized in reverse order with the specified single-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setr_ps(float __a, float __b, float __c, float __d,
float __e, float __f, float __g, float __h)
@@ -2601,6 +3805,31 @@ _mm256_setr_ps(float __a, float __b, float __c, float __d,
return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
int __i4, int __i5, int __i6, int __i7)
@@ -2608,6 +3837,47 @@ _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
short __w11, short __w10, short __w09, short __w08,
@@ -2618,6 +3888,79 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
char __b27, char __b26, char __b25, char __b24,
@@ -2635,6 +3978,23 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
{
@@ -2642,24 +4002,74 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
}
/* Create vectors with repeated elements */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
+/// of the four double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set1_pd(double __w)
{
return (__m256d){ __w, __w, __w, __w };
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
+/// of the eight single-precision floating-point vector elements set to the
+/// specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set1_ps(float __w)
{
return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
}
+/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
+/// 32-bit integral vector elements set to the specified 32-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __i
+/// A 32-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
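+///
+/// For example, a minimal broadcast sketch (illustrative value):
+/// \code
+///   __m256i v = _mm256_set1_epi32(42);  // all eight 32-bit lanes hold 42
+/// \endcode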
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi32(int __i)
{
return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
}
+/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
+/// 16-bit integral vector elements set to the specified 16-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A 16-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi16(short __w)
{
@@ -2667,6 +4077,17 @@ _mm256_set1_epi16(short __w)
__w, __w, __w, __w, __w, __w };
}
+/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
+/// 8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+/// An 8-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi8(char __b)
{
@@ -2675,6 +4096,18 @@ _mm256_set1_epi8(char __b)
__b, __b, __b, __b, __b, __b, __b };
}
+/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
+/// 64-bit integral vector elements set to the specified 64-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+/// A 64-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi64x(long long __q)
{
@@ -2682,18 +4115,41 @@ _mm256_set1_epi64x(long long __q)
}
/* Create __zeroed vectors */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
return (__m256d){ 0, 0, 0, 0 };
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}
+/// \brief Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_si256(void)
{
@@ -2701,72 +4157,210 @@ _mm256_setzero_si256(void)
}
/* Cast between vector types */
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
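+///
+/// For example, this sketch illustrates that the cast reinterprets bits and
+/// performs no value conversion (values are illustrative):
+/// \code
+///   __m256d d = _mm256_set1_pd(1.0);
+///   __m256  f = _mm256_castpd_ps(d);  // same 256 bits viewed as 8 floats;
+///                                     // the float lanes are NOT 1.0f
+/// \endcode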
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castpd_ps(__m256d __a)
{
return (__m256)__a;
}
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castpd_si256(__m256d __a)
{
return (__m256i)__a;
}
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castps_pd(__m256 __a)
{
return (__m256d)__a;
}
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castps_si256(__m256 __a)
{
return (__m256i)__a;
}
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castsi256_ps(__m256i __a)
{
return (__m256)__a;
}
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castsi256_pd(__m256i __a)
{
return (__m256d)__a;
}
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+/// lower 128 bits of the parameter.
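+///
+/// For example (illustrative values, AVX assumed):
+/// \code
+///   __m256d v  = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
+///   __m128d lo = _mm256_castpd256_pd128(v);  // lo holds {0.0, 1.0}
+/// \endcode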
static __inline __m128d __DEFAULT_FN_ATTRS
_mm256_castpd256_pd128(__m256d __a)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
}
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+/// lower 128 bits of the parameter.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_castps256_ps128(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
}
+/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+/// parameter.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_castsi256_si128(__m256i __a)
{
return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The contents of the upper 128
+/// bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
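+///
+/// For example, a sketch of widening a 128-bit vector (illustrative values):
+/// \code
+///   __m128d lo = _mm_set_pd(1.0, 0.0);        // {0.0, 1.0}
+///   __m256d v  = _mm256_castpd128_pd256(lo);  // bits [127:0] == lo; the
+///                                             // upper two elements are
+///                                             // undefined and must not be
+///                                             // relied upon
+/// \endcode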
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castpd128_pd256(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The contents of the upper 128 bits are
+/// undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castps128_ps256(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
}
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The contents
+/// of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The contents of the upper 128 bits are undefined.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castsi128_si256(__m128i __a)
{
@@ -2778,6 +4372,38 @@ _mm256_castsi128_si256(__m128i __a)
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
*/
+/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
+/// a 256-bit vector of [8 x float] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [4 x float] in the second parameter. The immediate
+///    integer parameter selects between the upper and the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [8 x float]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [4 x float]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
+///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
+///    [255:128] of the result. \n
+///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
+///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
+///    [127:0] of the result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
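+///
+/// For example, an illustrative sketch that replaces the upper 128 bits
+/// (assumes AVX is enabled):
+/// \code
+///   __m256 base = _mm256_setzero_ps();
+///   __m128 part = _mm_set1_ps(1.0f);
+///   __m256 r = _mm256_insertf128_ps(base, part, 1);  // upper four elements
+///                                                    // are 1.0f, lower four
+///                                                    // remain 0.0f
+/// \endcode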
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_shufflevector( \
(__v8sf)(__m256)(V1), \
@@ -2791,6 +4417,38 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 10 : 6), \
(((M) & 1) ? 11 : 7) );})
+/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
+/// a 256-bit vector of [4 x double] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [2 x double] in the second parameter. The immediate
+///    integer parameter selects between the upper and the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [4 x double]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [2 x double]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
+///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
+///    [255:128] of the result. \n
+///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
+///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
+///    [127:0] of the result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_shufflevector( \
(__v4df)(__m256d)(V1), \
@@ -2800,6 +4458,38 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 4 : 2), \
(((M) & 1) ? 5 : 3) );})
+/// \brief Constructs a new 256-bit integer vector by first duplicating a
+/// 256-bit integer vector given in the first parameter, and then replacing
+/// either the upper or the lower 128 bits with the contents of a 128-bit
+/// integer vector in the second parameter. The immediate integer parameter
+///    selects between the upper and the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit integer vector. This vector is copied to the result first, and
+/// then either the upper or the lower 128 bits of the result will be
+/// replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit integer vector. The contents of this parameter are written to
+/// either the upper or the lower 128 bits of the result depending on the
+/// value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
+///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
+///    [255:128] of the result. \n
+///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
+///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
+///    [127:0] of the result.
+/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_shufflevector( \
(__v4di)(__m256i)(V1), \
@@ -2814,6 +4504,27 @@ _mm256_castsi128_si256(__m128i __a)
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
*/
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [8 x float], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
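+///
+/// For example (illustrative values, AVX assumed):
+/// \code
+///   __m256 v  = _mm256_setr_ps(0.0f, 1.0f, 2.0f, 3.0f,
+///                              4.0f, 5.0f, 6.0f, 7.0f);
+///   __m128 hi = _mm256_extractf128_ps(v, 1);  // hi holds {4, 5, 6, 7}
+/// \endcode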
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
(__m128)__builtin_shufflevector( \
(__v8sf)(__m256)(V), \
@@ -2823,6 +4534,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 6 : 2), \
(((M) & 1) ? 7 : 3) );})
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [4 x double], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
(__m128d)__builtin_shufflevector( \
(__v4df)(__m256d)(V), \
@@ -2830,6 +4562,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );})
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
+/// integer vector, as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit integer vector.
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
(__v4di)(__m256i)(V), \
@@ -2838,6 +4591,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 3 : 1) );})
/* SIMD load ops (unaligned) */
+/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
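+///
+/// For example, a sketch using ordinary (possibly unaligned) arrays; the
+/// values are illustrative:
+/// \code
+///   float lo[4] = {0.0f, 1.0f, 2.0f, 3.0f};
+///   float hi[4] = {4.0f, 5.0f, 6.0f, 7.0f};
+///   __m256 v = _mm256_loadu2_m128(hi, lo);  // v == {0, 1, ..., 7}; note the
+///                                           // high-half pointer comes first
+/// \endcode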
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
@@ -2845,6 +4619,27 @@ _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
}
+/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
@@ -2852,6 +4647,24 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
}
+/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
+/// constructs a 256-bit integer vector by concatenating the two 128-bit
+/// vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[255:128] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[127:0] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
{
@@ -2860,6 +4673,24 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
}
/* SIMD store ops (unaligned) */
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
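+///
+/// For example (illustrative values):
+/// \code
+///   float lo[4], hi[4];
+///   __m256 v = _mm256_setr_ps(0.0f, 1.0f, 2.0f, 3.0f,
+///                             4.0f, 5.0f, 6.0f, 7.0f);
+///   _mm256_storeu2_m128(hi, lo, v);  // hi == {4, 5, 6, 7},
+///                                    // lo == {0, 1, 2, 3}
+/// \endcode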
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
@@ -2871,6 +4702,24 @@ _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
_mm_storeu_ps(__addr_hi, __v128);
}
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
@@ -2882,6 +4731,24 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
_mm_storeu_pd(__addr_hi, __v128);
}
+/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
+/// two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
{
@@ -2893,33 +4760,132 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
_mm_storeu_si128(__addr_hi, __v128);
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
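+///
+/// For example (illustrative values):
+/// \code
+///   __m128 lo = _mm_set1_ps(1.0f);
+///   __m128 hi = _mm_set1_ps(2.0f);
+///   __m256 v  = _mm256_set_m128(hi, lo);  // bits [127:0] hold 1.0f lanes,
+///                                         // bits [255:128] hold 2.0f lanes
+/// \endcode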
static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_m128d (__m128d __hi, __m128d __lo) {
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_m128i (__m128i __hi, __m128i __lo) {
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
+/// similar to _mm256_set_m128, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
return _mm256_set_m128(__hi, __lo);
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
+/// similar to _mm256_set_m128d, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors. This is similar to _mm256_set_m128i, but the order of
+/// the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h
index 30acfaeb9f3b..488eb2dbd3d4 100644
--- a/lib/Headers/bmiintrin.h
+++ b/lib/Headers/bmiintrin.h
@@ -36,7 +36,7 @@
/// unsigned short _tzcnt_u16(unsigned short a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
@@ -53,7 +53,7 @@
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
/// \endcode
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned integer containing one of the operands.
@@ -73,7 +73,7 @@
/// unsigned int _blsi_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned integer whose bits are to be cleared.
@@ -91,7 +91,7 @@
/// unsigned int _blsmsk_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned integer used to create the mask.
@@ -107,7 +107,7 @@
/// unsigned int _blsr_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned integer containing the operand to be cleared.
@@ -123,7 +123,7 @@
/// unsigned int _tzcnt_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
@@ -143,7 +143,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
@@ -160,7 +160,7 @@ __tzcnt_u16(unsigned short __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
@@ -180,7 +180,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
@@ -202,7 +202,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
@@ -225,7 +225,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
@@ -243,7 +243,7 @@ __blsi_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
@@ -259,7 +259,7 @@ __blsmsk_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
@@ -275,7 +275,7 @@ __blsr_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
@@ -291,12 +291,12 @@ __tzcnt_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero
-/// bits in the operand.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+/// the operand.
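+///
+/// For example (illustrative values; assumes BMI support, e.g. -mbmi):
+/// \code
+///   int a = _mm_tzcnt_32(0x00000008U);  // a == 3
+///   int b = _mm_tzcnt_32(0U);           // b == 32: TZCNT of zero yields
+///                                       // the operand width
+/// \endcode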
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
@@ -314,7 +314,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
/// \endcode
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing one of the operands.
@@ -334,7 +334,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsi_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose bits are to be cleared.
@@ -352,7 +352,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsmsk_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer used to create the mask.
@@ -368,7 +368,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsr_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing the operand to be cleared.
@@ -384,7 +384,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _tzcnt_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
@@ -397,7 +397,7 @@ _mm_tzcnt_32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
@@ -417,7 +417,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
@@ -439,7 +439,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
@@ -462,7 +462,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
@@ -480,7 +480,7 @@ __blsi_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
@@ -496,7 +496,7 @@ __blsmsk_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
@@ -512,7 +512,7 @@ __blsr_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
@@ -528,12 +528,12 @@ __tzcnt_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An 64-bit integer containing the number of trailing zero
-/// bits in the operand.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+/// the operand.
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
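As an illustrative aside (not part of this patch), a minimal sketch of the
TZCNT/BEXTR intrinsics documented above; the file and function names are
hypothetical, and it assumes a BMI-capable target (e.g. compiling with -mbmi):

    /* tzcnt_bextr_demo.c -- build with: cc -mbmi tzcnt_bextr_demo.c */
    #include <stdio.h>
    #include <x86intrin.h>

    int main(void) {
      unsigned int x = 0x50;                    /* binary 0101 0000 */
      /* TZCNT counts trailing zero bits: 4 for this value. */
      printf("tzcnt = %u\n", __tzcnt_u32(x));
      /* _bextr_u32(x, start, len): bits [6:4] of x are 101b = 0x5. */
      printf("bextr = %#x\n", _bextr_u32(x, 4, 3));
      return 0;
    }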
diff --git a/lib/Headers/cuda_wrappers/algorithm b/lib/Headers/cuda_wrappers/algorithm
new file mode 100644
index 000000000000..95d9beb73c68
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/algorithm
@@ -0,0 +1,96 @@
+/*===---- algorithm - CUDA wrapper for <algorithm> --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11. In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear. We choose not to do this because
+//
+// a) why write our own implementation when we can use one from the standard
+// library? and
+// b) libstdc++ is evil and declares min/max inside a header that is included
+// *before* we include <algorithm>. So we'd have to unconditionally
+// declare our __device__ overloads of min/max, but that would pollute
+// things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+ return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+ return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+ return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+ return __a < __b ? __b : __a;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
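As a hypothetical usage sketch (not part of this patch): with this wrapper on
the include path, CUDA device code in C++11 mode can call std::min/std::max
directly, resolving to the __device__ overloads defined above. The kernel name
below is invented for illustration:

    #include <algorithm>

    // Clamp each element to [lo, hi] using the std::min/std::max overloads
    // that this wrapper provides for device code.
    __global__ void clamp_kernel(float *data, int n, float lo, float hi) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        data[i] = std::min(hi, std::max(lo, data[i]));
    }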
diff --git a/lib/Headers/cuda_wrappers/complex b/lib/Headers/cuda_wrappers/complex
new file mode 100644
index 000000000000..11d40a82a8f6
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/complex
@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implementations of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+// // Present only when compiling for host.
+// __host__ __device__ complex<float> sin(const complex<float>& x) {
+// return __builtin_csinf(x);
+// }
+//
+// // Present when compiling for host and for device.
+// template <typename T>
+// __host__ __device__ complex<T> sin(const complex<T>& x) {
+//   return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                     cos(x.real()) * sinh(x.imag()));
+// }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host. We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
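A hypothetical sketch of what the wrapper enables (not part of this patch):
because <complex> is included under the force_cuda_host_device pragma, the
std::complex arithmetic operators become callable from device code. The kernel
name is invented for illustration:

    #include <complex>

    // Multiply each element by a unit phasor e^(i*theta).
    __global__ void rotate_kernel(std::complex<float> *v, int n, float theta) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        v[i] *= std::complex<float>(cosf(theta), sinf(theta));
    }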
diff --git a/lib/Headers/cuda_wrappers/new b/lib/Headers/cuda_wrappers/new
new file mode 100644
index 000000000000..b77131af0e5b
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/new
@@ -0,0 +1,47 @@
+/*===---- new - CUDA wrapper for <new> ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+// Device overrides for placement new and delete.
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+ return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+ return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
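An illustrative sketch (not part of this patch) of what these overloads
permit: constructing an object into preallocated storage from device code.
The type and function names are invented for illustration, and alignas
assumes C++11:

    #include <new>

    struct Vec2 {
      float x, y;
      __device__ Vec2(float x_, float y_) : x(x_), y(y_) {}
    };

    __device__ void placement_demo() {
      // Raw, suitably aligned storage; the placement new below resolves to
      // the __device__ overload defined in this wrapper.
      alignas(Vec2) unsigned char buf[sizeof(Vec2)];
      Vec2 *v = new (buf) Vec2(1.0f, 2.0f);
      v->~Vec2();
    }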
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 70d6d726110a..1512f9f0b47b 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -49,6 +49,21 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+/// \brief Adds lower double-precision values in both operands and returns the
+/// sum in the lower 64 bits of the result. The upper 64 bits of the result
+/// are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
+/// from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b)
{
@@ -56,12 +71,41 @@ _mm_add_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a + (__v2df)__b);
}
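An illustrative snippet (not part of this patch) contrasting the scalar and
packed forms documented above; the function name is invented:

    #include <emmintrin.h>

    void add_demo(void) {
      __m128d a = _mm_set_pd(10.0, 1.0);  /* a = [hi: 10.0, lo: 1.0] */
      __m128d b = _mm_set_pd(20.0, 2.0);  /* b = [hi: 20.0, lo: 2.0] */
      __m128d s = _mm_add_sd(a, b);       /* [10.0, 3.0]: low lanes added, a's high lane kept */
      __m128d p = _mm_add_pd(a, b);       /* [30.0, 3.0]: both lanes added */
      (void)s; (void)p;
    }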
+/// \brief Subtracts the lower double-precision value of the second operand
+/// from the lower double-precision value of the first operand and returns
+/// the difference in the lower 64 bits of the result. The upper 64 bits of
+/// the result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// difference of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b)
{
@@ -69,12 +113,40 @@ _mm_sub_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a - (__v2df)__b);
}
+/// \brief Multiplies lower double-precision values in both operands and returns
+/// the product in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// product of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b)
{
@@ -82,12 +154,41 @@ _mm_mul_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a * (__v2df)__b);
}
+/// \brief Divides the lower double-precision value of the first operand by the
+/// lower double-precision value of the second operand and returns the
+/// quotient in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// quotient of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b)
{
@@ -95,12 +196,44 @@ _mm_div_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Performs an element-by-element division of two 128-bit vectors of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a / (__v2df)__b);
}
+/// \brief Calculates the square root of the lower double-precision value of
+/// the second operand and returns it in the lower 64 bits of the result.
+/// The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// upper 64 bits of this operand are copied to the upper 64 bits of the
+/// result.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// square root of the lower 64 bits of operand \a __b, and whose upper 64
+/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
@@ -108,150 +241,518 @@ _mm_sqrt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Calculates the square root of each of the two values stored in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a)
{
return __builtin_ia32_sqrtpd((__v2df)__a);
}
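A small sketch (not part of this patch) of the merge semantics of the scalar
square root versus the packed form; the function name is invented:

    #include <emmintrin.h>

    void sqrt_demo(void) {
      __m128d a = _mm_set_pd(7.0, -1.0);   /* high lane is passed through */
      __m128d b = _mm_set_pd(99.0, 16.0);  /* low lane is the sqrt input */
      __m128d r = _mm_sqrt_sd(a, b);       /* [7.0, 4.0] */
      __m128d q = _mm_sqrt_pd(b);          /* [sqrt(99.0), 4.0] */
      (void)r; (void)q;
    }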
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the lesser of the pair of values in the lower 64 bits of the
+/// result. The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// minimum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the lesser of each pair of
+/// values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the greater of the pair of values in the lower 64 bits of the
+/// result. The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// maximum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the greater of each pair
+/// of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a & (__v4su)__b);
+ return (__m128d)((__v2du)__a & (__v2du)__b);
}
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+/// the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the left source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values in the second operand and the one's complement of the first
+/// operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b)
{
- return (__m128d)(~(__v4su)__a & (__v4su)__b);
+ return (__m128d)(~(__v2du)__a & (__v2du)__b);
}
+/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a | (__v4su)__b);
+ return (__m128d)((__v2du)__a | (__v2du)__b);
}
+/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a ^ (__v4su)__b);
+ return (__m128d)((__v2du)__a ^ (__v2du)__b);
}
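The bitwise [2 x double] operations above are commonly used for sign-bit
manipulation; an illustrative sketch (not part of this patch, function name
invented):

    #include <emmintrin.h>

    void sign_demo(void) {
      __m128d v    = _mm_set_pd(-2.5, 3.0);
      __m128d sign = _mm_set1_pd(-0.0);       /* 0x8000000000000000 per lane */
      __m128d neg  = _mm_xor_pd(v, sign);     /* flip signs:  [ 2.5, -3.0] */
      __m128d absv = _mm_andnot_pd(sign, v);  /* clear signs: [ 2.5,  3.0] */
      (void)neg; (void)absv;
    }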
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
}
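The all-zeros/all-ones lane masks produced by these packed comparisons compose
with the bitwise operations above for branchless selection; a hypothetical
sketch (not part of this patch):

    #include <emmintrin.h>

    /* Per-lane minimum built from a compare mask; behaves like _mm_min_pd
       for ordinary (non-NaN) inputs. */
    __m128d select_min(__m128d a, __m128d b) {
      __m128d mask = _mm_cmplt_pd(a, b);      /* all-ones where a < b */
      return _mm_or_pd(_mm_and_pd(mask, a),
                       _mm_andnot_pd(mask, b));
    }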
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are ordered with respect to those in the second operand. A pair
+/// of double-precision values are "ordered" with respect to each other if
+/// neither value is a NaN. Each comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unordered with respect to those in the second operand. A pair
+/// of double-precision values are "unordered" with respect to each other if
+/// one or both values are NaN. Each comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unequal to those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than or equal to those in the second operand.
+/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
@@ -259,6 +760,24 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
@@ -266,36 +785,147 @@ _mm_cmpge_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "ordered" with respect to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
+/// "ordered" with respect to each other if neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "unordered" with respect to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
+/// are "unordered" with respect to each other if one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than the corresponding
+/// value in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than the corresponding
+/// value in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
@@ -303,6 +933,24 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
@@ -310,84 +958,317 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
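Because the scalar comparisons deliver their 0h/FFFFFFFFFFFFFFFFh result in
the low 64 bits, the mask is typically extracted with a movemask; an
illustrative sketch (not part of this patch, function name invented):

    #include <emmintrin.h>

    int low_lane_lt(__m128d a, __m128d b) {
      __m128d m = _mm_cmplt_sd(a, b);
      return _mm_movemask_pd(m) & 1;  /* 1 iff the low-lane comparison held */
    }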
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
}
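A small sketch (not part of this patch, function name invented) of the NaN
behavior documented for the unordered scalar comparisons:

    #include <emmintrin.h>
    #include <math.h>

    void nan_demo(void) {
      __m128d qnan = _mm_set_sd(nan(""));
      __m128d one  = _mm_set_sd(1.0);
      int eq = _mm_ucomieq_sd(qnan, one);  /* 1: NaN operands yield 1 here */
      int gt = _mm_ucomigt_sd(qnan, one);  /* 0: NaN operands yield 0 here */
      (void)eq; (void)gt;
    }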
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true. If
+/// either of the two lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true. If either of the two lower double-precision values is
+/// NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0 for false, 1 for true.
+/// If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true. If either of the two lower double-precision values
+/// is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true. If
+/// either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
}
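Taken together, the ordered (COMISD) and unordered (UCOMISD) predicates above differ only in their documented NaN results and exception behavior. A minimal sketch of the NaN results, assuming a C99 compiler with SSE2 enabled:

#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    __m128d one  = _mm_set_sd(1.0);
    __m128d qnan = _mm_set_sd(NAN);

    /* With a NaN operand, eq/lt/le return 1 and gt/ge/neq return 0,
       exactly as documented above. */
    printf("ucomieq:  %d\n", _mm_ucomieq_sd(one, qnan));  /* 1 */
    printf("ucomile:  %d\n", _mm_ucomile_sd(one, qnan));  /* 1 */
    printf("ucomigt:  %d\n", _mm_ucomigt_sd(one, qnan));  /* 0 */
    printf("ucomineq: %d\n", _mm_ucomineq_sd(one, qnan)); /* 0 */
    return 0;
}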
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two single-precision floating-point
+/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+/// The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a)
{
return __builtin_ia32_cvtpd2ps((__v2df)__a);
}
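As an illustrative sketch (hypothetical helper name), narrowing with _mm_cvtpd_ps and widening back with _mm_cvtps_pd (documented next) round-trips exactly for values representable in single precision:

#include <emmintrin.h>

/* Round-trip: the upper two float lanes produced by _mm_cvtpd_ps are
   zero, and _mm_cvtps_pd converts only the two low float lanes. */
static __m128d roundtrip_pd(__m128d d) {
    __m128 s = _mm_cvtpd_ps(d);
    return _mm_cvtps_pd(s);
}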
+/// \brief Converts the lower two single-precision floating-point elements of a
+/// 128-bit vector of [4 x float] into two double-precision floating-point
+/// values, returned in a 128-bit vector of [2 x double]. The upper two
+/// elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower two single-precision
+/// floating-point elements are converted to double-precision values. The
+/// upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)
{
@@ -395,6 +1276,19 @@ _mm_cvtps_pd(__m128 __a)
__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
}
+/// \brief Converts the lower two integer elements of a 128-bit vector of
+/// [4 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double]. The upper two elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
+/// converted to double-precision values. The upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)
{
@@ -402,24 +1296,84 @@ _mm_cvtepi32_pd(__m128i __a)
__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+/// 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)
{
return __builtin_ia32_cvtpd2dq((__v2df)__a);
}
+/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
+/// into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a)
{
return __builtin_ia32_cvtsd2si((__v2df)__a);
}
+/// \brief Converts the lower double-precision floating-point element of a
+/// 128-bit vector of [2 x double], in the second parameter, into a
+/// single-precision floating-point value, returned in the lower 32 bits of a
+/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+/// copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+/// copied to the upper 96 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+/// converted value from the second parameter. The upper 96 bits are copied
+/// from the upper 96 bits of the first parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
}
+/// \brief Converts a 32-bit signed integer value, in the second parameter, into
+/// a double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a, int __b)
{
@@ -427,6 +1381,25 @@ _mm_cvtsi32_sd(__m128d __a, int __b)
return __a;
}
+/// \brief Converts the lower single-precision floating-point element of a
+/// 128-bit vector of [4 x float], in the second parameter, into a
+/// double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower single-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
@@ -434,48 +1407,145 @@ _mm_cvtss_sd(__m128d __a, __m128 __b)
return __a;
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
+/// result of either conversion is inexact, the result is truncated (rounded
+/// towards zero) regardless of the current MXCSR setting. The upper 64 bits
+/// of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a)
{
return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
}
+/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
+/// signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a)
{
return __builtin_ia32_cvttsd2si((__v2df)__a);
}
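The difference between the rounding (CVTSD2SI) and truncating (CVTTSD2SI) forms is easiest to see on a fractional input; a minimal sketch assuming the default MXCSR rounding mode:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d v = _mm_set_sd(2.75);
    /* _mm_cvtsd_si32 rounds according to MXCSR (round-to-nearest-even
       by default); _mm_cvttsd_si32 always truncates toward zero. */
    printf("%d\n", _mm_cvtsd_si32(v));  /* 3 under the default mode */
    printf("%d\n", _mm_cvttsd_si32(v)); /* 2 */
    return 0;
}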
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvtpd_pi32(__m128d __a)
{
return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32]. If the result of either
+/// conversion is inexact, the result is truncated (rounded towards zero)
+/// regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvttpd_pi32(__m128d __a)
{
return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
}
+/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
+/// [2 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtpi32_pd(__m64 __a)
{
return __builtin_ia32_cvtpi2pd((__v2si)__a);
}
+/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
+/// a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+/// bits of \a __a.
static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)
{
return __a[0];
}
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp)
{
return *(__m128d*)__dp;
}
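For contrast with the unaligned form documented below, a small sketch of the alignment contract (hypothetical helper; the _Alignas qualifier is C11 and assumed available):

#include <emmintrin.h>

_Alignas(16) static double aligned_buf[2] = { 1.0, 2.0 };

static __m128d load_both(const double *p) {
    __m128d a = _mm_load_pd(aligned_buf); /* requires 16-byte alignment */
    __m128d u = _mm_loadu_pd(p);          /* no alignment requirement   */
    return _mm_add_pd(a, u);
}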
+/// \brief Loads a double-precision floating-point value from a specified memory
+/// location and duplicates it to both vector elements of a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp)
{
@@ -488,6 +1558,20 @@ _mm_load1_pd(double const *__dp)
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
+/// \brief Loads two double-precision values, in reverse order, from an aligned
+/// memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction,
+/// plus any needed shuffling instructions. In AVX mode, the shuffle may be
+/// combined with the <c> VMOVAPD </c>, resulting in a single
+/// <c> VPERMILPD </c> instruction.
+///
+/// \param __dp
+/// A 16-byte aligned pointer to an array of double-precision values to be
+/// loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+/// values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
@@ -495,6 +1579,17 @@ _mm_loadr_pd(double const *__dp)
return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
}
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
+/// unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
@@ -524,6 +1619,23 @@ _mm_load_sd(double const *__dp)
return (__m128d){ __u, 0 };
}
+/// \brief Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of [2 x double]. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [127:64] of the result. The address of the memory location does not have
+/// to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a, double const *__dp)
{
@@ -534,6 +1646,23 @@ _mm_loadh_pd(__m128d __a, double const *__dp)
return (__m128d){ __a[0], __u };
}
+/// \brief Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of [2 x double]. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [63:0] of the result. The address of the memory location does not have to
+/// be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a, double const *__dp)
{
@@ -544,48 +1673,149 @@ _mm_loadl_pd(__m128d __a, double const *__dp)
return (__m128d){ __u, __a[1] };
}
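A hypothetical helper showing how the two half-register loads combine to gather two non-adjacent doubles into one vector:

#include <emmintrin.h>

static __m128d gather2(const double *lo, const double *hi) {
    __m128d v = _mm_setzero_pd();
    v = _mm_loadl_pd(v, lo); /* bits [63:0]   <- *lo */
    v = _mm_loadh_pd(v, hi); /* bits [127:64] <- *hi */
    return v;
}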
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
+/// unspecified content. This could be used as an argument to another
+/// intrinsic function where the argument is required but the value is not
+/// actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+/// content.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)
{
return (__m128d)__builtin_ia32_undef128();
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits of the vector are initialized with the specified double-precision
+/// floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+/// lower 64 bits contain the value of the parameter. The upper 64 bits are
+/// set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)
{
return (__m128d){ __w, 0 };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
+/// of the two double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)
{
return (__m128d){ __w, __w };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w, double __x)
{
return (__m128d){ __x, __w };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w, double __x)
{
return (__m128d){ __w, __x };
}
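The only difference between _mm_set_pd and _mm_setr_pd is the argument order; a quick sketch (hypothetical helper name):

#include <emmintrin.h>

static __m128d make_vecs(void) {
    /* Both calls build the vector with element 0 = 1.0 and
       element 1 = 2.0: _mm_set_pd takes the high element first,
       _mm_setr_pd takes the elements in memory order. */
    __m128d a = _mm_set_pd(2.0, 1.0);
    __m128d b = _mm_setr_pd(1.0, 2.0);
    return _mm_add_pd(a, b); /* { 2.0, 4.0 } */
}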
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+/// all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)
{
return (__m128d){ 0, 0 };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+/// upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+/// lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a, __m128d __b)
{
return (__m128d){ __b[0], __a[1] };
}
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp, __m128d __a)
{
@@ -608,12 +1838,36 @@ _mm_store1_pd(double *__dp, __m128d __a)
_mm_store_pd(__dp, __a);
}
+/// \brief Stores the lower double-precision element of a 128-bit vector of
+/// [2 x double], duplicated, into both 64-bit halves of an aligned memory
+/// location. This is equivalent to _mm_store1_pd.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \param __a
+/// A 128-bit vector of [2 x double] whose lower element is stored to both
+/// halves of the memory location.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp, __m128d __a)
{
return _mm_store1_pd(__dp, __a);
}
+/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a)
{
@@ -623,6 +1877,20 @@ _mm_storeu_pd(double *__dp, __m128d __a)
((struct __storeu_pd*)__dp)->__v = __a;
}
+/// \brief Stores two double-precision values, in reverse order, from a 128-bit
+/// vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 16-byte aligned memory location that can store two
+/// double-precision values.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be reversed and
+/// stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp, __m128d __a)
{
@@ -630,6 +1898,17 @@ _mm_storer_pd(double *__dp, __m128d __a)
*(__m128d *)__dp = __a;
}
+/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp, __m128d __a)
{
@@ -639,6 +1918,17 @@ _mm_storeh_pd(double *__dp, __m128d __a)
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp, __m128d __a)
{
@@ -648,127 +1938,391 @@ _mm_storel_pd(double *__dp, __m128d __a)
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}
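Mirroring the half-register loads above, a hypothetical helper that scatters the two halves of a vector to separate locations:

#include <emmintrin.h>

static void scatter2(double *lo, double *hi, __m128d v) {
    _mm_storel_pd(lo, v); /* *lo <- bits [63:0]   */
    _mm_storeh_pd(hi, v); /* *hi <- bits [127:64] */
}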
+/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+/// saving the lower 8 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [16 x i8]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// \param __b
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a, __m128i __b)
{
return (__m128i)((__v16qu)__a + (__v16qu)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+/// saving the lower 16 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [8 x i16]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a, __m128i __b)
{
return (__m128i)((__v8hu)__a + (__v8hu)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+/// saving the lower 32 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [4 x i32]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32].
+/// \param __b
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a, __m128i __b)
{
return (__m128i)((__v4su)__a + (__v4su)__b);
}
+/// \brief Adds two signed or unsigned 64-bit integer values, returning the
+/// lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer.
+/// \param __b
+/// A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_add_si64(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+/// saving the lower 64 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [2 x i64]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64].
+/// \param __b
+/// A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a, __m128i __b)
{
return (__m128i)((__v2du)__a + (__v2du)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [16 x i8] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
+/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [16 x i8] vector.
+/// \param __b
+/// A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+/// both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}
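Contrasting the wrapping add (_mm_add_epi8, documented above) with this saturating add on an overflowing input; a minimal sketch:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi8(100);
    __m128i b = _mm_set1_epi8(100);
    signed char out[16];

    _mm_storeu_si128((__m128i *)out, _mm_add_epi8(a, b));
    printf("wrapped:   %d\n", out[0]); /* 200 wraps to -56 */

    _mm_storeu_si128((__m128i *)out, _mm_adds_epi8(a, b));
    printf("saturated: %d\n", out[0]); /* clamped to 7Fh = 127 */
    return 0;
}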
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [8 x i16] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
+/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to
+/// 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+/// both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [16 x i8]. Sums greater than FFh are
+/// saturated to FFh; because the operands are unsigned, no sum can fall
+/// below 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [8 x i16]. Sums greater than FFFFh are
+/// saturated to FFFFh; because the operands are unsigned, no sum can fall
+/// below 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [16 x i8] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+/// averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}
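The rounded average computed by PAVGB is (a + b + 1) >> 1, evaluated without intermediate overflow; a sketch:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi8((char)255);
    __m128i b = _mm_set1_epi8(0);
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, _mm_avg_epu8(a, b));
    printf("%u\n", out[0]); /* (255 + 0 + 1) >> 1 = 128 */
    return 0;
}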
+/// \brief Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [8 x i16] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+/// averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, producing eight intermediate 32-bit signed integer products, and
+/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
+/// [4 x i32] vector. For example, bits [15:0] of both parameters are
+/// multiplied producing a 32-bit product, bits [31:16] of both parameters
+/// are multiplied producing a 32-bit product, and the sum of those two
+/// products becomes bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}
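A small sketch of the pairwise multiply-add, which is the usual building block for 16-bit dot products:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i b = _mm_set1_epi16(1);
    int out[4];
    /* Each 32-bit lane is a[2i]*b[2i] + a[2i+1]*b[2i+1]. */
    _mm_storeu_si128((__m128i *)out, _mm_madd_epi16(a, b));
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 7 11 15 */
    return 0;
}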
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
}
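A common use of these min/max pairs is lane-wise clamping; a hypothetical helper for signed 16-bit lanes:

#include <emmintrin.h>

static __m128i clamp_epi16(__m128i v, short lo, short hi) {
    v = _mm_max_epi16(v, _mm_set1_epi16(lo)); /* raise lanes below lo */
    v = _mm_min_epi16(v, _mm_set1_epi16(hi)); /* lower lanes above hi */
    return v;
}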
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+/// each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
}
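Because PMULHW and PMULLW (the latter documented below) each keep one half of the 32-bit product, interleaving their results reconstructs the full products; a sketch for the low four lanes:

#include <emmintrin.h>

static __m128i full_products_lo(__m128i a, __m128i b) {
    __m128i lo = _mm_mullo_epi16(a, b); /* low 16 bits of each product  */
    __m128i hi = _mm_mulhi_epi16(a, b); /* high 16 bits of each product */
    /* Interleave to form the 32-bit products of the four low pairs. */
    return _mm_unpacklo_epi16(lo, hi);
}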
+/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+/// of each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
}
-/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
-/// returns a vector containing the low-order 16 bits of each 32-bit product
-/// in the corresponding element.
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the lower 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
///
/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
+/// A 128-bit signed [8 x i16] vector.
/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the products of both operands.
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+/// each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
@@ -781,7 +2335,7 @@ _mm_mullo_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
///
/// \param __a
/// A 64-bit integer containing one of the source operands.
@@ -800,7 +2354,7 @@ _mm_mul_su32(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
///
/// \param __a
/// A [2 x i64] vector containing one of the source operands.
@@ -821,7 +2375,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -839,7 +2393,7 @@ _mm_sad_epu8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -857,7 +2411,7 @@ _mm_sub_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -875,7 +2429,7 @@ _mm_sub_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -894,7 +2448,7 @@ _mm_sub_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBQ instruction.
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing the minuend.
@@ -912,7 +2466,7 @@ _mm_sub_si64(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -933,7 +2487,7 @@ _mm_sub_epi64(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -954,7 +2508,7 @@ _mm_subs_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -974,7 +2528,7 @@ _mm_subs_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -994,7 +2548,7 @@ _mm_subs_epu8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -1012,7 +2566,7 @@ _mm_subs_epu16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPAND / PAND instruction.
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1031,7 +2585,7 @@ _mm_and_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
///
/// \param __a
/// A 128-bit vector containing the left source operand. The one's complement
@@ -1049,7 +2603,7 @@ _mm_andnot_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPOR / POR instruction.
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1067,7 +2621,7 @@ _mm_or_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1090,13 +2644,13 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
-/// An immediate value specifying the number of bytes to left-shift
-/// operand a.
+/// An immediate value specifying the number of bytes to left-shift operand
+/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
@@ -1127,13 +2681,13 @@ _mm_xor_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a, int __count)
@@ -1146,13 +2700,13 @@ _mm_slli_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a, __m128i __count)
@@ -1165,13 +2719,13 @@ _mm_sll_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a, int __count)
@@ -1184,13 +2738,13 @@ _mm_slli_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a, __m128i __count)
@@ -1203,13 +2757,13 @@ _mm_sll_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a, int __count)
@@ -1222,13 +2776,13 @@ _mm_slli_epi64(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a, __m128i __count)
@@ -1242,13 +2796,13 @@ _mm_sll_epi64(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a, int __count)
@@ -1262,13 +2816,13 @@ _mm_srai_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a, __m128i __count)
@@ -1282,13 +2836,13 @@ _mm_sra_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a, int __count)
@@ -1302,13 +2856,13 @@ _mm_srai_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a, __m128i __count)
@@ -1325,13 +2879,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
/// An immediate value specifying the number of bytes to right-shift operand
-/// a.
+/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
@@ -1362,13 +2916,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a, int __count)
@@ -1381,13 +2935,13 @@ _mm_srli_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a, __m128i __count)
@@ -1400,13 +2954,13 @@ _mm_srl_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a, int __count)
@@ -1419,13 +2973,13 @@ _mm_srli_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a, __m128i __count)
@@ -1438,13 +2992,13 @@ _mm_srl_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a, int __count)
@@ -1457,13 +3011,13 @@ _mm_srli_epi64(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a, __m128i __count)
@@ -1477,7 +3031,7 @@ _mm_srl_epi64(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1496,7 +3050,7 @@ _mm_cmpeq_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1515,7 +3069,7 @@ _mm_cmpeq_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1535,7 +3089,7 @@ _mm_cmpeq_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1557,7 +3111,7 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1577,7 +3131,7 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1597,7 +3151,7 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1617,7 +3171,7 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1637,7 +3191,7 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1658,7 +3212,7 @@ _mm_cmplt_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
@@ -1680,7 +3234,7 @@ _mm_cvtsi64_sd(__m128d __a, long long __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1697,7 +3251,8 @@ _mm_cvtsd_si64(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1714,7 +3269,7 @@ _mm_cvttsd_si64(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1729,7 +3284,7 @@ _mm_cvtepi32_ps(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1746,7 +3301,8 @@ _mm_cvtps_epi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+/// instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1762,7 +3318,7 @@ _mm_cvttps_epi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A 32-bit signed integer operand.
@@ -1779,7 +3335,7 @@ _mm_cvtsi32_si128(int __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A 64-bit signed integer operand containing the value to be converted.
@@ -1796,7 +3352,7 @@ _mm_cvtsi64_si128(long long __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A vector of [4 x i32]. The least significant 32 bits are moved to the
@@ -1815,7 +3371,7 @@ _mm_cvtsi128_si32(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A vector of [2 x i64]. The least significant 64 bits are moved to the
@@ -1833,7 +3389,7 @@ _mm_cvtsi128_si64(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
///
/// \param __p
/// An aligned pointer to a memory location containing integer values.
@@ -1849,7 +3405,7 @@ _mm_load_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
///
/// \param __p
/// A pointer to a memory location containing integer values.
@@ -1868,7 +3424,7 @@ _mm_loadu_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __p
/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
@@ -2154,42 +3710,170 @@ _mm_set1_epi8(char __b)
return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+/// instruction.
+///
+/// \param __q0
+/// A 64-bit integral value used to initialize the lower 64 bits of the
+/// result.
+/// \param __q1
+/// A 64-bit integral value used to initialize the upper 64 bits of the
+/// result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0, __m64 __q1)
{
return (__m128i){ (long long)__q0, (long long)__q1 };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
{
return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w0
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
{
return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b0
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
{
return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
}
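A minimal sketch (illustrative only) showing that the _mm_setr_* constructors take their arguments in ascending memory order:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);   /* __i0 lands in bits [31:0] */
    int out[4];
    _mm_storeu_si128((__m128i *)out, v);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4 */
    return 0;
}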
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+/// zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)
{
return (__m128i){ 0LL, 0LL };
}
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+/// 128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+/// A pointer to an aligned memory location that will receive the integer
+/// values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b)
{
*__p = __b;
}
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
@@ -2199,12 +3883,45 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
((struct __storeu_si128*)__p)->__v = __b;
}
+/// \brief Moves bytes selected by the mask from the first operand to the
+/// specified unaligned memory location. When a mask bit is 1, the
+/// corresponding byte is written, otherwise it is not written. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon). Exception and trap behavior for elements not selected for storage
+/// to memory is implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+/// instruction.
+///
+/// \param __d
+/// A 128-bit integer vector containing the values to be moved.
+/// \param __n
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each byte represents the mask bits.
+/// \param __p
+/// A pointer to an unaligned 128-bit memory location where the specified
+/// values are moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
__builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}
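A small hypothetical example of the byte-masked store; only bytes whose mask byte has its high bit set are written:

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    char dst[16];
    memset(dst, '.', sizeof(dst));
    __m128i data = _mm_set1_epi8('X');
    __m128i mask = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, -1);
    _mm_maskmoveu_si128(data, mask, dst);
    printf("%.16s\n", dst);   /* X.X............X */
    return 0;
}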
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+/// a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location that will receive the lower 64 bits
+/// of the integer vector parameter.
+/// \param __a
+/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+/// value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
@@ -2214,18 +3931,54 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A vector of [2 x double] containing the 64-bit values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
__builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A 128-bit integer vector containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
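A sketch of how the non-temporal stores above are typically used for large fills that should not pollute the cache. Assumptions: dst is 16-byte aligned, and _mm_sfence comes from xmmintrin.h, which this header includes:

#include <emmintrin.h>

void fill_nontemporal(__m128i *dst, __m128i v, int n) {
    for (int i = 0; i < n; ++i)
        _mm_stream_si128(dst + i, v);   /* bypasses the cache hierarchy */
    _mm_sfence();                       /* order the streaming stores */
}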
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+/// A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+/// A 32-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
@@ -2233,6 +3986,18 @@ _mm_stream_si32(int *__p, int __a)
}
#ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+/// A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
@@ -2240,42 +4005,154 @@ _mm_stream_si64(long long *__p, long long __a)
}
#endif
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflush(void const *__p)
-{
- __builtin_ia32_clflush(__p);
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_lfence(void)
-{
- __builtin_ia32_lfence();
-}
+/// \brief The cache line containing \a __p is flushed and invalidated from all
+/// caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+/// A pointer to the memory location used to identify the cache line to be
+/// flushed.
+void _mm_clflush(void const *);
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mfence(void)
-{
- __builtin_ia32_mfence();
-}
+/// \brief Forces strong memory ordering (serialization) between load
+/// instructions preceding this instruction and load instructions following
+/// this instruction, ensuring the system completes all previous loads before
+/// executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
+/// \brief Forces strong memory ordering (serialization) between load and store
+/// instructions preceding this instruction and load and store instructions
+/// following this instruction, ensuring that the system completes all
+/// previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
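A simplified illustration of where the memory fence sits in a publish pattern; real code would also use C11 atomics, this only shows the ordering role of the fence:

#include <emmintrin.h>

int payload;
int ready;

void publish(int value) {
    payload = value;   /* store the data first */
    _mm_mfence();      /* all prior stores complete before the flag is set */
    ready = 1;
}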
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7F are saturated to 0x7F.
+/// Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+/// operands into 16-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit unsigned integers, and packs the results into the
+/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
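The saturating pack intrinsics can be demonstrated with a small hypothetical program:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi16(-1, 0, 100, 300, -200, 127, 128, 255);
    __m128i p = _mm_packus_epi16(a, a);   /* unsigned saturation to [0, 255] */
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, p);
    printf("%u %u %u %u\n", out[0], out[3], out[4], out[7]); /* 0 255 0 255 */
    return 0;
}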
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+/// the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [3:0] select values from \a __a to be assigned
+///    to bits [15:0] of the result. \n
+/// 000: assign values from bits [15:0] of \a __a. \n
+/// 001: assign values from bits [31:16] of \a __a. \n
+/// 010: assign values from bits [47:32] of \a __a. \n
+/// 011: assign values from bits [63:48] of \a __a. \n
+/// 100: assign values from bits [79:64] of \a __a. \n
+/// 101: assign values from bits [95:80] of \a __a. \n
+/// 110: assign values from bits [111:96] of \a __a. \n
+/// 111: assign values from bits [127:112] of \a __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+/// integer vector parameter and the remaining bits are assigned zeros.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
@@ -2283,6 +4160,26 @@ _mm_extract_epi16(__m128i __a, int __imm)
return (unsigned short)__b[__imm & 7];
}
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+/// 128-bit integer vector parameter, and then inserting the lower 16 bits
+/// of an integer parameter into an offset specified by the immediate-value
+/// parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
+/// result and then one of the eight elements in the result is replaced by
+/// the lower 16 bits of \a __b.
+/// \param __b
+/// An integer. The lower 16 bits of this parameter are written to the
+/// result beginning at an offset specified by \a __imm.
+/// \param __imm
+/// An immediate value specifying the bit offset in the result at which the
+/// lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
@@ -2291,18 +4188,85 @@ _mm_insert_epi16(__m128i __a, int __b, int __imm)
return (__m128i)__c;
}
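A minimal sketch pairing the extract and insert intrinsics above:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    int w = _mm_extract_epi16(v, 3);          /* reads element 3: 13 */
    __m128i r = _mm_insert_epi16(v, 99, 3);   /* replaces element 3 */
    printf("%d %d\n", w, _mm_extract_epi16(r, 3));   /* 13 99 */
    return 0;
}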
+/// \brief Copies the values of the most significant bits from each 8-bit
+/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+/// value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+/// written to bits [15:0]. The other bits are assigned zeros.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{
return __builtin_ia32_pmovmskb128((__v16qi)__a);
}
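A common use of the byte movemask is scanning for a particular byte value; a hypothetical zero-byte search:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi8(1, 2, 3, 0, 5, 0, 7, 8,
                              9, 10, 0, 12, 13, 14, 15, 16);
    __m128i eq = _mm_cmpeq_epi8(a, _mm_setzero_si128());
    int mask = _mm_movemask_epi8(eq);   /* one bit per byte lane */
    printf("0x%04x\n", mask);           /* zeros at lanes 3, 5, 10: 0x0428 */
    return 0;
}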
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+/// elements of a 128-bit integer vector parameter, using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param imm
+/// An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a. The destinations within the 128-bit destination are assigned
+/// values as follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [31:0] of \a a. \n
+/// 01: assign values from bits [63:32] of \a a. \n
+/// 10: assign values from bits [95:64] of \a a. \n
+/// 11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
(__v4si)_mm_undefined_si128(), \
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
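For example (illustrative only), imm = 0x1B encodes the selector pattern 00 01 10 11 and reverses the four elements:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i v = _mm_setr_epi32(10, 11, 12, 13);
    __m128i r = _mm_shuffle_epi32(v, 0x1B);   /* reverse element order */
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 13 12 11 10 */
    return 0;
}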
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+/// [127:64] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
+///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
+///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
+///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [15:0] of \a a. \n
+/// 01: assign values from bits [31:16] of \a a. \n
+/// 10: assign values from bits [47:32] of \a a. \n
+/// 11: assign values from bits [63:48] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \
@@ -2310,6 +4274,33 @@ _mm_movemask_epi8(__m128i __a)
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
4, 5, 6, 7); })
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+/// [63:0] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
+///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
+///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
+///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [79:64] of \a a. \n
+/// 01: assign values from bits [95:80] of \a a. \n
+/// 10: assign values from bits [111:96] of \a a. \n
+/// 11: assign values from bits [127:112] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \
@@ -2319,137 +4310,480 @@ _mm_movemask_epi8(__m128i __a)
4 + (((imm) >> 4) & 0x3), \
4 + (((imm) >> 6) & 0x3)); })
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+/// instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8]. \n
+/// Bits [71:64] are written to bits [7:0] of the result. \n
+/// Bits [79:72] are written to bits [23:16] of the result. \n
+/// Bits [87:80] are written to bits [39:32] of the result. \n
+/// Bits [95:88] are written to bits [55:48] of the result. \n
+/// Bits [103:96] are written to bits [71:64] of the result. \n
+/// Bits [111:104] are written to bits [87:80] of the result. \n
+/// Bits [119:112] are written to bits [103:96] of the result. \n
+/// Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [71:64] are written to bits [15:8] of the result. \n
+/// Bits [79:72] are written to bits [31:24] of the result. \n
+/// Bits [87:80] are written to bits [47:40] of the result. \n
+/// Bits [95:88] are written to bits [63:56] of the result. \n
+/// Bits [103:96] are written to bits [79:72] of the result. \n
+/// Bits [111:104] are written to bits [95:88] of the result. \n
+/// Bits [119:112] are written to bits [111:104] of the result. \n
+/// Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+/// instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. \n
+/// Bits [79:64] are written to bits [15:0] of the result. \n
+/// Bits [95:80] are written to bits [47:32] of the result. \n
+/// Bits [111:96] are written to bits [79:64] of the result. \n
+/// Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. \n
+/// Bits [79:64] are written to bits [31:16] of the result. \n
+/// Bits [95:80] are written to bits [63:48] of the result. \n
+/// Bits [111:96] are written to bits [95:80] of the result. \n
+/// Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
+/// Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
+/// Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
}
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
+/// Bits [31:24] are written to bits [55:48] of the result. \n
+/// Bits [39:32] are written to bits [71:64] of the result. \n
+/// Bits [47:40] are written to bits [87:80] of the result. \n
+/// Bits [55:48] are written to bits [103:96] of the result. \n
+/// Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8]. \n
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
+/// Bits [31:24] are written to bits [63:56] of the result. \n
+/// Bits [39:32] are written to bits [79:72] of the result. \n
+/// Bits [47:40] are written to bits [95:88] of the result. \n
+/// Bits [55:48] are written to bits [111:104] of the result. \n
+/// Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
+/// [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+/// instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. \n
+/// Bits [15:0] are written to bits [15:0] of the result. \n
+/// Bits [31:16] are written to bits [47:32] of the result. \n
+/// Bits [47:32] are written to bits [79:64] of the result. \n
+/// Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. \n
+/// Bits [15:0] are written to bits [31:16] of the result. \n
+/// Bits [31:16] are written to bits [63:48] of the result. \n
+/// Bits [47:32] are written to bits [95:80] of the result. \n
+/// Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
+/// Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
+/// Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination. \n
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination. \n
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}
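Taken together, the unpack intrinsics interleave elements of the two source vectors; a brief sketch for the 32-bit variants:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);
    __m128i b = _mm_setr_epi32(10, 11, 12, 13);
    int lo[4], hi[4];
    _mm_storeu_si128((__m128i *)lo, _mm_unpacklo_epi32(a, b)); /* 0 10 1 11 */
    _mm_storeu_si128((__m128i *)hi, _mm_unpackhi_epi32(a, b)); /* 2 12 3 13 */
    printf("%d %d %d %d | %d %d %d %d\n",
           lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3]);
    return 0;
}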
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+/// integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
return (__m64)__a[0];
}
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+/// upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
+///
+/// \param __a
+/// A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
return (__m128i){ (long long)__a, 0 };
}
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+/// integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+/// vector of [2 x double], zero-extends the value, and writes it to the
+/// low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values with sign bits to
+/// be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+/// written to bits [1:0]. The remaining bits are assigned values of zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
return __builtin_ia32_movmskpd((__v2df)__a);
}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+/// 128-bit vector parameters of [2 x double], using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [2 x double].
+/// \param b
+/// A 128-bit vector of [2 x double].
+/// \param i
+/// An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from \a a and \a b: \n
+///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
+/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
0 + (((i) >> 0) & 0x1), \
2 + (((i) >> 1) & 0x1)); })
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)
{
return (__m128)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)
{
return (__m128i)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)
{
return (__m128d)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)
{
return (__m128i)__a;
}
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)
{
return (__m128)__a;
}
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)
{
return (__m128d)__a;
}
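Because these casts are pure bitwise reinterpretations, they are handy for integer tricks on floating-point data; a hypothetical sign-flip sketch:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d x = _mm_set_pd(2.0, -1.5);
    __m128i signs = _mm_set1_epi64x((long long)0x8000000000000000ULL);
    __m128d y = _mm_castsi128_pd(
        _mm_xor_si128(_mm_castpd_si128(x), signs));   /* flip both sign bits */
    double out[2];
    _mm_storeu_pd(out, y);
    printf("%f %f\n", out[0], out[1]);   /* 1.500000 -2.000000 */
    return 0;
}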
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_pause(void)
-{
- __builtin_ia32_pause();
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+/// \brief Indicates that a spin loop is being executed for the purposes of
+/// optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
#undef __DEFAULT_FN_ATTRS
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
diff --git a/lib/Headers/f16cintrin.h b/lib/Headers/f16cintrin.h
index 415bf732fb9f..180712ffc680 100644
--- a/lib/Headers/f16cintrin.h
+++ b/lib/Headers/f16cintrin.h
@@ -37,7 +37,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
@@ -59,17 +59,17 @@ _cvtsh_ss(unsigned short __a)
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
-/// An immediate value controlling rounding using bits [2:0]:
-/// 000: Nearest
-/// 001: Down
-/// 010: Up
-/// 011: Truncate
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
@@ -85,16 +85,16 @@ _cvtsh_ss(unsigned short __a)
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
-/// An immediate value controlling rounding using bits [2:0]:
-/// 000: Nearest
-/// 001: Down
-/// 010: Up
-/// 011: Truncate
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
@@ -107,7 +107,7 @@ _cvtsh_ss(unsigned short __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
diff --git a/lib/Headers/float.h b/lib/Headers/float.h
index a28269ebebbe..0f453d87cbcb 100644
--- a/lib/Headers/float.h
+++ b/lib/Headers/float.h
@@ -27,9 +27,12 @@
/* If we're on MinGW, fall back to the system's float.h, which might have
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
*/
-#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
- __has_include_next(<float.h>)
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+ __STDC_HOSTED__ && __has_include_next(<float.h>)
# include_next <float.h>
/* Undefine anything that we'll be redefining below. */
diff --git a/lib/Headers/fxsrintrin.h b/lib/Headers/fxsrintrin.h
index ac6026aa5ba2..786081ca8eab 100644
--- a/lib/Headers/fxsrintrin.h
+++ b/lib/Headers/fxsrintrin.h
@@ -30,25 +30,75 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave(void *__p) {
+_fxsave(void *__p)
+{
return __builtin_ia32_fxsave(__p);
}
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p) {
- return __builtin_ia32_fxsave64(__p);
+_fxrstor(void *__p)
+{
+ return __builtin_ia32_fxrstor(__p);
}
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor(void *__p) {
- return __builtin_ia32_fxrstor(__p);
+_fxsave64(void *__p)
+{
+ return __builtin_ia32_fxsave64(__p);
}
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor64(void *__p) {
+_fxrstor64(void *__p)
+{
return __builtin_ia32_fxrstor64(__p);
}
+#endif
#undef __DEFAULT_FN_ATTRS
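
A minimal caller of the FXSAVE/FXRSTOR intrinsics above might look like this (a sketch only; fpu_state is an illustrative name, C11 alignas is assumed, and the target needs FXSR support, e.g. -mfxsr):

    #include <x86intrin.h>
    #include <stdalign.h>

    /* 512-byte save area, 16-byte aligned as the documentation requires. */
    static alignas(16) unsigned char fpu_state[512];

    void save_fp_state(void)    { _fxsave(fpu_state); }
    void restore_fp_state(void) { _fxrstor(fpu_state); }
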
diff --git a/lib/Headers/ia32intrin.h b/lib/Headers/ia32intrin.h
index 397f3fd13e01..4928300103ad 100644
--- a/lib/Headers/ia32intrin.h
+++ b/lib/Headers/ia32intrin.h
@@ -60,12 +60,6 @@ __rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
}
-/* __rdtsc */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
-__rdtsc(void) {
- return __builtin_ia32_rdtsc();
-}
-
/* __rdtscp */
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rdtscp(unsigned int *__A) {
diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h
index 4b2752353d6f..7f91d49fbcec 100644
--- a/lib/Headers/immintrin.h
+++ b/lib/Headers/immintrin.h
@@ -69,9 +69,44 @@
Intel documents these as being in immintrin.h, and
they depend on typedefs from avxintrin.h. */
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+/// containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+/// A 256-bit vector containing 32-bit single-precision float values to be
+/// converted to 16-bit half-precision float values.
+/// \param imm
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
+/// 1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+/// float values.
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+/// values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector containing 16-bit half-precision float values to be
+/// converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+/// single-precision float values.
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_mm256_cvtph_ps(__m128i __a)
{
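
A sketch of the 256-bit conversions documented above (illustrative names; assumes AVX and F16C are enabled, e.g. -mavx -mf16c):

    #include <immintrin.h>

    /* Compress 8 floats to 16-bit halves and expand them back. */
    void roundtrip8(const float *in, float *out) {
      __m256 v = _mm256_loadu_ps(in);
      __m128i h = _mm256_cvtps_ph(v, 3);   /* imm bits [2:0] = 011: truncate */
      _mm256_storeu_ps(out, _mm256_cvtph_ps(h));
    }
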
diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h
index f18711ad1ecf..7c91ebaee8cb 100644
--- a/lib/Headers/intrin.h
+++ b/lib/Headers/intrin.h
@@ -34,6 +34,10 @@
#include <x86intrin.h>
#endif
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
/* For the definition of jmp_buf. */
#if __STDC_HOSTED__
#include <setjmp.h>
@@ -62,7 +66,9 @@ void __cpuid(int[4], int);
static __inline__
void __cpuidex(int[4], int, int);
void __debugbreak(void);
+static __inline__
__int64 __emul(int, int);
+static __inline__
unsigned __int64 __emulu(unsigned int, unsigned int);
void __cdecl __fastfail(unsigned int);
unsigned int __getcallerseflags(void);
@@ -93,6 +99,7 @@ static __inline__
void __movsd(unsigned long *, unsigned long const *, size_t);
static __inline__
void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
void __nop(void);
void __nvreg_restore_fence(void);
void __nvreg_save_fence(void);
@@ -249,10 +256,12 @@ static __inline__
unsigned long __cdecl _lrotl(unsigned long, int);
static __inline__
unsigned long __cdecl _lrotr(unsigned long, int);
-static __inline__
-void _ReadBarrier(void);
-static __inline__
-void _ReadWriteBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
static __inline__
void *_ReturnAddress(void);
unsigned int _rorx_u32(unsigned int, const unsigned int);
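
The deprecation message added above points at C11/C++11 atomics; the direct counterpart of these compiler-only barriers is a signal fence (a sketch, not part of the header):

    #include <stdatomic.h>

    /* Portable equivalent of _ReadWriteBarrier(): a compiler-only fence. */
    void compiler_barrier(void) {
      atomic_signal_fence(memory_order_seq_cst);
    }
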
@@ -281,8 +290,9 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
void _Store_HLERelease(long volatile *, long);
void _Store64_HLERelease(__int64 volatile *, __int64);
void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__
-void _WriteBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
static __inline__
@@ -307,7 +317,6 @@ void __lwpval64(unsigned __int64, unsigned int, unsigned int);
unsigned __int64 __lzcnt64(unsigned __int64);
static __inline__
void __movsq(unsigned long long *, unsigned long long const *, size_t);
-__int64 __mulh(__int64, __int64);
static __inline__
unsigned __int64 __popcnt64(unsigned __int64);
static __inline__
@@ -378,30 +387,15 @@ void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
-static __inline__
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-static __inline__
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-static __inline__
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
-static __inline__
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
long _InterlockedOr_np(long volatile *_Value, long _Mask);
short _InterlockedOr16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
char _InterlockedOr8_np(char volatile *_Value, char _Mask);
long _InterlockedXor_np(long volatile *_Value, long _Mask);
short _InterlockedXor16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
char _InterlockedXor8_np(char volatile *_Value, char _Mask);
-static __inline__
-__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
- __int64 *_HighProduct);
unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
__int64 _sarx_i64(__int64, unsigned int);
#if __STDC_HOSTED__
@@ -409,119 +403,44 @@ int __cdecl _setjmpex(jmp_buf);
#endif
unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-/*
- * Multiply two 64-bit integers and obtain a 64-bit result.
- * The low-half is returned directly and the high half is in an out parameter.
- */
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
- unsigned __int64 *_HighProduct) {
- unsigned __int128 _FullProduct =
- (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
- *_HighProduct = _FullProduct >> 64;
- return _FullProduct;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
- unsigned __int128 _FullProduct =
- (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
- return _FullProduct >> 64;
-}
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+ unsigned __int64,
+ unsigned __int64*);
#endif /* __x86_64__ */
-/*----------------------------------------------------------------------------*\
-|* Multiplication
-\*----------------------------------------------------------------------------*/
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-__emul(int __in1, int __in2) {
- return (__int64)__in1 * (__int64)__in2;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__emulu(unsigned int __in1, unsigned int __in2) {
- return (unsigned __int64)__in1 * (unsigned __int64)__in2;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Twiddling
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotl8(unsigned char _Value, unsigned char _Shift) {
- _Shift &= 0x7;
- return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotr8(unsigned char _Value, unsigned char _Shift) {
- _Shift &= 0x7;
- return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotl16(unsigned short _Value, unsigned char _Shift) {
- _Shift &= 0xf;
- return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotr16(unsigned short _Value, unsigned char _Shift) {
- _Shift &= 0xf;
- return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotl(unsigned int _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotr(unsigned int _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotl(unsigned long _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotr(unsigned long _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotl64(unsigned __int64 _Value, int _Shift) {
- _Shift &= 0x3f;
- return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotr64(unsigned __int64 _Value, int _Shift) {
- _Shift &= 0x3f;
- return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
-}
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
/*----------------------------------------------------------------------------*\
|* Bit Counting and Testing
\*----------------------------------------------------------------------------*/
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
- if (!_Mask)
- return 0;
- *_Index = __builtin_ctzl(_Mask);
- return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
- if (!_Mask)
- return 0;
- *_Index = 31 - __builtin_clzl(_Mask);
- return 1;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__popcnt16(unsigned short _Value) {
- return __builtin_popcount((int)_Value);
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__popcnt(unsigned int _Value) {
- return __builtin_popcount(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest(long const *_BitBase, long _BitPos) {
return (*_BitBase >> _BitPos) & 1;
}
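
The plain _bittest above and the atomic _interlockedbittestandset in the next hunk combine into a common one-shot pattern (a sketch; names are illustrative):

    /* Run fn exactly once: _interlockedbittestandset returns the bit's
       previous value, so only the first caller observes 0. */
    static long init_done;

    void call_once(void (*fn)(void)) {
      if (!_interlockedbittestandset(&init_done, 0))
        fn();
    }
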
@@ -548,26 +467,24 @@ _interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
return (_PrevVal >> _BitPos) & 1;
}
-#ifdef __x86_64__
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
- if (!_Mask)
- return 0;
- *_Index = __builtin_ctzll(_Mask);
- return 1;
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+ return (_PrevVal >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
- if (!_Mask)
- return 0;
- *_Index = 63 - __builtin_clzll(_Mask);
- return 1;
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+ return (_PrevVal >> _BitPos) & 1;
}
-static __inline__
-unsigned __int64 __DEFAULT_FN_ATTRS
-__popcnt64(unsigned __int64 _Value) {
- return __builtin_popcountll(_Value);
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+ return (_PrevVal >> _BitPos) & 1;
}
+#endif
+#ifdef __x86_64__
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
return (*_BitBase >> _BitPos) & 1;
@@ -600,196 +517,449 @@ _interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange Add
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Sub
-\*----------------------------------------------------------------------------*/
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Increment
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedIncrement16(short volatile *_Value) {
- return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement16_acq(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedIncrement64(__int64 volatile *_Value) {
- return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Decrement
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedDecrement16(short volatile *_Value) {
- return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement16_acq(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedDecrement64(__int64 volatile *_Value) {
- return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked And
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedAnd8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedAnd16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedAnd(long volatile *_Value, long _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Or
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedOr8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedOr16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedOr(long volatile *_Value, long _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Xor
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedXor8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedXor16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedXor(long volatile *_Value, long _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchange8(char volatile *_Target, char _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
return _Value;
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchange16(short volatile *_Target, short _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+ return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
return _Value;
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
return _Value;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange8(char volatile *_Destination,
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
char _Exchange, char _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+ char _Exchange, char _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+ char _Exchange, char _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_RELEASE, __ATOMIC_RELAXED);
return _Comparand;
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange16(short volatile *_Destination,
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
short _Exchange, short _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
return _Comparand;
}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64(__int64 volatile *_Destination,
- __int64 _Exchange, __int64 _Comparand) {
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+ short _Exchange, short _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
return _Comparand;
}
-/*----------------------------------------------------------------------------*\
-|* Barriers
-\*----------------------------------------------------------------------------*/
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+ short _Exchange, short _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+ return _Comparand;
}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
}
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__faststorefence(void) {
- __atomic_thread_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+ return _Comparand;
}
#endif
/*----------------------------------------------------------------------------*\
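
The _acq/_nf/_rel suffixes introduced above map to C11 acquire, relaxed, and release orderings respectively; a minimal spinlock sketch built on them (ARM/AArch64 only; names are illustrative):

    static long volatile lock_word;

    void lock(void) {
      /* Acquire ordering on the winning 0 -> 1 transition. */
      while (_InterlockedCompareExchange_acq(&lock_word, 1, 0) != 0)
        ; /* spin */
    }

    void unlock(void) {
      /* Release ordering publishes the critical section's writes. */
      _InterlockedExchange_rel(&lock_word, 0);
    }
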
@@ -840,59 +1010,39 @@ __readgsqword(unsigned long __offset) {
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __DEFAULT_FN_ATTRS
__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
- __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
- __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
- __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
- __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
- __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
- __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
}
#endif
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
- __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
- __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
}
#endif
/*----------------------------------------------------------------------------*\
|* Misc
\*----------------------------------------------------------------------------*/
-static __inline__ void * __DEFAULT_FN_ATTRS
-_AddressOfReturnAddress(void) {
- return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
-}
-static __inline__ void * __DEFAULT_FN_ATTRS
-_ReturnAddress(void) {
- return __builtin_return_address(0);
-}
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __DEFAULT_FN_ATTRS
__cpuid(int __info[4], int __level) {
@@ -914,6 +1064,10 @@ static __inline__ void __DEFAULT_FN_ATTRS
__halt(void) {
__asm__ volatile ("hlt");
}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+ __asm__ volatile ("nop");
+}
#endif
/*----------------------------------------------------------------------------*\
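
A conventional use of the __cpuid intrinsic shown above is reading the vendor string from leaf 0, which is returned in EBX:EDX:ECX, i.e. info[1], info[3], info[2] (a sketch; x86 only):

    #include <string.h>

    void cpu_vendor(char out[13]) {
      int info[4];
      __cpuid(info, 0);
      memcpy(out + 0, &info[1], 4);
      memcpy(out + 4, &info[3], 4);
      memcpy(out + 8, &info[2], 4);
      out[12] = '\0';
    }
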
diff --git a/lib/Headers/lzcntintrin.h b/lib/Headers/lzcntintrin.h
index 4c00e42ac3a9..3d2769da3bae 100644
--- a/lib/Headers/lzcntintrin.h
+++ b/lib/Headers/lzcntintrin.h
@@ -31,18 +31,48 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned short __DEFAULT_FN_ATTRS
__lzcnt16(unsigned short __X)
{
return __X ? __builtin_clzs(__X) : 16;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
return __X ? __builtin_clz(__X) : 32;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
@@ -50,12 +80,32 @@ _lzcnt_u32(unsigned int __X)
}
#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__lzcnt64(unsigned long long __X)
{
return __X ? __builtin_clzll(__X) : 64;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
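
The zero-input behavior called out above is what distinguishes LZCNT from BSR: a zero operand yields the operand width instead of an undefined result. A sketch (assumes -mlzcnt):

    #include <x86intrin.h>

    /* floor(log2(x)) for x != 0; note _lzcnt_u32(0) is defined and yields 32. */
    unsigned int floor_log2(unsigned int x) {
      return 31 - _lzcnt_u32(x);
    }
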
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
index cefd6053aa80..e0c277a65a33 100644
--- a/lib/Headers/mmintrin.h
+++ b/lib/Headers/mmintrin.h
@@ -39,7 +39,7 @@ typedef char __v8qi __attribute__((__vector_size__(8)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c EMMS instruction.
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void __DEFAULT_FN_ATTRS
_mm_empty(void)
@@ -52,7 +52,7 @@ _mm_empty(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __i
/// A 32-bit integer value.
@@ -69,7 +69,7 @@ _mm_cvtsi32_si64(int __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@@ -85,7 +85,7 @@ _mm_cvtsi64_si32(__m64 __m)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
///
/// \param __i
/// A 64-bit signed integer.
@@ -101,7 +101,7 @@ _mm_cvtsi64_m64(long long __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@@ -121,7 +121,7 @@ _mm_cvtm64_si64(__m64 __m)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKSSWB instruction.
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -151,7 +151,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKSSDW instruction.
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
@@ -181,7 +181,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKUSWB instruction.
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -208,19 +208,19 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// Bits [39:32] are written to bits [7:0] of the result.
-/// Bits [47:40] are written to bits [23:16] of the result.
-/// Bits [55:48] are written to bits [39:32] of the result.
+/// A 64-bit integer vector of [8 x i8]. \n
+/// Bits [39:32] are written to bits [7:0] of the result. \n
+/// Bits [47:40] are written to bits [23:16] of the result. \n
+/// Bits [55:48] are written to bits [39:32] of the result. \n
/// Bits [63:56] are written to bits [55:48] of the result.
/// \param __m2
/// A 64-bit integer vector of [8 x i8].
-/// Bits [39:32] are written to bits [15:8] of the result.
-/// Bits [47:40] are written to bits [31:24] of the result.
-/// Bits [55:48] are written to bits [47:40] of the result.
+/// Bits [39:32] are written to bits [15:8] of the result. \n
+/// Bits [47:40] are written to bits [31:24] of the result. \n
+/// Bits [55:48] are written to bits [47:40] of the result. \n
/// Bits [63:56] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
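
The byte placement described above works out as follows (a sketch with illustrative values; assumes MMX, e.g. -mmmx):

    #include <mmintrin.h>

    void unpack_demo(void) {
      __m64 a = _mm_cvtsi64_m64(0x0706050403020100LL); /* bytes 0x00..0x07 */
      __m64 b = _mm_cvtsi64_m64(0x8786858483828180LL); /* bytes 0x80..0x87 */
      __m64 hi = _mm_unpackhi_pi8(a, b);
      /* hi == 0x8707860685058404: a4,b4,a5,b5,a6,b6,a7,b7, low byte first */
      (void)hi;
      _mm_empty(); /* clear MMX state before any x87 floating-point use */
    }
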
@@ -235,15 +235,15 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [15:0] of the result.
+/// Bits [47:32] are written to bits [15:0] of the result. \n
/// Bits [63:48] are written to bits [47:32] of the result.
/// \param __m2
/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [31:16] of the result.
+/// Bits [47:32] are written to bits [31:16] of the result. \n
/// Bits [63:48] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
@@ -258,7 +258,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
@@ -279,19 +279,19 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [7:0] of the result.
-/// Bits [15:8] are written to bits [23:16] of the result.
-/// Bits [23:16] are written to bits [39:32] of the result.
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
/// Bits [31:24] are written to bits [55:48] of the result.
/// \param __m2
/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [15:8] of the result.
-/// Bits [15:8] are written to bits [31:24] of the result.
-/// Bits [23:16] are written to bits [47:40] of the result.
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
/// Bits [31:24] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
@@ -306,15 +306,15 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [15:0] of the result.
+/// Bits [15:0] are written to bits [15:0] of the result. \n
/// Bits [31:16] are written to bits [47:32] of the result.
/// \param __m2
/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [31:16] of the result.
+/// Bits [15:0] are written to bits [31:16] of the result. \n
/// Bits [31:16] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
@@ -329,7 +329,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
@@ -352,7 +352,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDB instruction.
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -373,7 +373,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDW instruction.
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -394,7 +394,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDD instruction.
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -416,7 +416,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDSB instruction.
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -439,7 +439,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDSW instruction.
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -461,7 +461,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDUSB instruction.
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -483,7 +483,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDUSW instruction.
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -504,7 +504,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBB instruction.
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -525,7 +525,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBW instruction.
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -546,7 +546,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBD instruction.
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32] containing the minuends.
@@ -569,7 +569,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBSB instruction.
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -592,7 +592,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBSW instruction.
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -615,7 +615,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -638,7 +638,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -663,7 +663,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMADDWD instruction.
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -684,7 +684,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULHW instruction.
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -705,7 +705,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULLW instruction.
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -727,14 +727,15 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If __count is greater or equal to 16, the result is set to all 0.
+/// values. If \a __count is greater than or equal to 16, the result is set
+/// to all 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi16(__m64 __m, __m64 __count)
{
@@ -748,14 +749,15 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If __count is greater or equal to 16, the result is set to all 0.
+/// values. If \a __count is greater than or equal to 16, the result is set
+/// to all 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_pi16(__m64 __m, int __count)
{
@@ -770,14 +772,15 @@ _mm_slli_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If __count is greater or equal to 32, the result is set to all 0.
+/// values. If \a __count is greater than or equal to 32, the result is set
+/// to all 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi32(__m64 __m, __m64 __count)
{
@@ -791,14 +794,15 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If __count is greater or equal to 32, the result is set to all 0.
+/// values. If \a __count is greater than or equal to 32, the result is set
+/// to all 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_pi32(__m64 __m, int __count)
{
@@ -811,14 +815,14 @@ _mm_slli_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// __count is greater or equal to 64, the result is set to 0.
+/// \a __count is greater than or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_si64(__m64 __m, __m64 __count)
{
@@ -831,14 +835,14 @@ _mm_sll_si64(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// __count is greater or equal to 64, the result is set to 0.
+/// \a __count is greater than or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_si64(__m64 __m, int __count)
{
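A detail worth noting in the shift hunks above: unlike scalar C shifts, where shifting by the full element width is undefined behavior, these intrinsics define a count greater than or equal to the element width to yield zero. A minimal sketch of that behavior (not part of this patch; plain C, built with -mmmx):

    #include <stdio.h>
    #include <mmintrin.h>

    int main(void) {
      __m64 v = _mm_set1_pi16(0x00FF);
      __m64 r = _mm_slli_pi16(v, 16);        /* count >= 16 -> all lanes 0 */
      printf("%d\n", _mm_cvtsi64_si32(r) & 0xFFFF);  /* prints 0 */
      _mm_empty();                           /* clear MMX state before FP code */
      return 0;
    }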
@@ -854,7 +858,7 @@ _mm_slli_si64(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -876,7 +880,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -899,7 +903,7 @@ _mm_srai_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -921,7 +925,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -943,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -964,7 +968,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -986,7 +990,7 @@ _mm_srli_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -1007,7 +1011,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -1027,7 +1031,7 @@ _mm_srli_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
@@ -1046,7 +1050,7 @@ _mm_srl_si64(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
@@ -1063,7 +1067,7 @@ _mm_srli_si64(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAND instruction.
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1083,7 +1087,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PANDN instruction.
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector. The one's complement of this parameter is used
@@ -1102,7 +1106,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POR instruction.
+/// This intrinsic corresponds to the <c> POR </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1120,7 +1124,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PXOR instruction.
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1141,7 +1145,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -1162,7 +1166,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -1183,7 +1187,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -1204,7 +1208,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -1225,7 +1229,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -1246,7 +1250,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -1264,7 +1268,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -1356,7 +1360,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param __i
/// A 32-bit integer value used to initialize each vector element of the
@@ -1374,7 +1378,7 @@ _mm_set1_pi32(int __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param __w
/// A 16-bit integer value used to initialize each vector element of the
@@ -1391,8 +1395,8 @@ _mm_set1_pi16(short __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPUNPCKLBW + VPSHUFLW / \c PUNPCKLBW +
-/// PSHUFLW instruction.
+/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
+/// PSHUFLW </c> instruction.
///
/// \param __b
/// An 8-bit integer value used to initialize each vector element of the
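Before leaving mmintrin.h: the comparison intrinsics retagged above (PCMPEQ/PCMPGT) return per-lane masks, all ones for true and all zeros for false, which combines naturally with the splat intrinsics documented here. A small sketch under that reading (the helper name is made up for illustration):

    #include <mmintrin.h>

    /* Count how many of the eight byte lanes of a equal n. */
    static int count_equal_bytes(__m64 a, char n) {
      __m64 mask = _mm_cmpeq_pi8(a, _mm_set1_pi8(n)); /* 0xFF where equal */
      unsigned long long bits = (unsigned long long)_mm_cvtm64_si64(mask);
      int count = 0;
      for (int i = 0; i < 8; ++i)
        count += (int)((bits >> (8 * i)) & 1);  /* low bit of each 0xFF lane */
      _mm_empty();
      return count;
    }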
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 3e40d2c08d8c..11ef2f902945 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -63,11 +63,13 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "mwaitxintrin.h"
explicit module mm_malloc {
+ requires !freestanding
header "mm_malloc.h"
export * // note: for <stdlib.h> dependency
}
explicit module cpuid {
+ requires gnuinlineasm
header "cpuid.h"
}
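The two new requires lines encode real constraints: mm_malloc.h pulls in <stdlib.h>, which a freestanding build may not provide, and cpuid.h is written in terms of GNU inline assembly. For reference, a typical hosted consumer of the cpuid module looks like this (sketch, not from the patch):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
      unsigned eax, ebx, ecx, edx;
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))   /* leaf 1: feature flags */
        printf("sse4.2: %s\n", (ecx & (1u << 20)) ? "yes" : "no");
      return 0;
    }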
diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h
index 802927490e7f..0c25d312709d 100644
--- a/lib/Headers/opencl-c.h
+++ b/lib/Headers/opencl-c.h
@@ -17,6 +17,7 @@
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
// Optimizations
#define __purefn __attribute__((pure))
@@ -9810,14 +9811,6 @@ float3 __ovld __cnfn native_cos(float3 x);
float4 __ovld __cnfn native_cos(float4 x);
float8 __ovld __cnfn native_cos(float8 x);
float16 __ovld __cnfn native_cos(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_cos(double x);
-double2 __ovld __cnfn native_cos(double2 x);
-double3 __ovld __cnfn native_cos(double3 x);
-double4 __ovld __cnfn native_cos(double4 x);
-double8 __ovld __cnfn native_cos(double8 x);
-double16 __ovld __cnfn native_cos(double16 x);
-#endif //cl_khr_fp64
/**
* Compute x / y over an implementation-defined range.
@@ -9829,14 +9822,6 @@ float3 __ovld __cnfn native_divide(float3 x, float3 y);
float4 __ovld __cnfn native_divide(float4 x, float4 y);
float8 __ovld __cnfn native_divide(float8 x, float8 y);
float16 __ovld __cnfn native_divide(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_divide(double x, double y);
-double2 __ovld __cnfn native_divide(double2 x, double2 y);
-double3 __ovld __cnfn native_divide(double3 x, double3 y);
-double4 __ovld __cnfn native_divide(double4 x, double4 y);
-double8 __ovld __cnfn native_divide(double8 x, double8 y);
-double16 __ovld __cnfn native_divide(double16 x, double16 y);
-#endif //cl_khr_fp64
/**
 * Compute the base-e exponential of x over an
@@ -9849,14 +9834,6 @@ float3 __ovld __cnfn native_exp(float3 x);
float4 __ovld __cnfn native_exp(float4 x);
float8 __ovld __cnfn native_exp(float8 x);
float16 __ovld __cnfn native_exp(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp(double x);
-double2 __ovld __cnfn native_exp(double2 x);
-double3 __ovld __cnfn native_exp(double3 x);
-double4 __ovld __cnfn native_exp(double4 x);
-double8 __ovld __cnfn native_exp(double8 x);
-double16 __ovld __cnfn native_exp(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute the base-2 exponential of x over an
@@ -9869,14 +9846,6 @@ float3 __ovld __cnfn native_exp2(float3 x);
float4 __ovld __cnfn native_exp2(float4 x);
float8 __ovld __cnfn native_exp2(float8 x);
float16 __ovld __cnfn native_exp2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp2(double x);
-double2 __ovld __cnfn native_exp2(double2 x);
-double3 __ovld __cnfn native_exp2(double3 x);
-double4 __ovld __cnfn native_exp2(double4 x);
-double8 __ovld __cnfn native_exp2(double8 x);
-double16 __ovld __cnfn native_exp2(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute the base-10 exponential of x over an
@@ -9889,14 +9858,6 @@ float3 __ovld __cnfn native_exp10(float3 x);
float4 __ovld __cnfn native_exp10(float4 x);
float8 __ovld __cnfn native_exp10(float8 x);
float16 __ovld __cnfn native_exp10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp10(double x);
-double2 __ovld __cnfn native_exp10(double2 x);
-double3 __ovld __cnfn native_exp10(double3 x);
-double4 __ovld __cnfn native_exp10(double4 x);
-double8 __ovld __cnfn native_exp10(double8 x);
-double16 __ovld __cnfn native_exp10(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute natural logarithm over an implementation-defined
@@ -9909,14 +9870,6 @@ float3 __ovld __cnfn native_log(float3 x);
float4 __ovld __cnfn native_log(float4 x);
float8 __ovld __cnfn native_log(float8 x);
float16 __ovld __cnfn native_log(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log(double x);
-double2 __ovld __cnfn native_log(double2 x);
-double3 __ovld __cnfn native_log(double3 x);
-double4 __ovld __cnfn native_log(double4 x);
-double8 __ovld __cnfn native_log(double8 x);
-double16 __ovld __cnfn native_log(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute a base 2 logarithm over an implementation-defined
@@ -9928,14 +9881,6 @@ float3 __ovld __cnfn native_log2(float3 x);
float4 __ovld __cnfn native_log2(float4 x);
float8 __ovld __cnfn native_log2(float8 x);
float16 __ovld __cnfn native_log2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log2(double x);
-double2 __ovld __cnfn native_log2(double2 x);
-double3 __ovld __cnfn native_log2(double3 x);
-double4 __ovld __cnfn native_log2(double4 x);
-double8 __ovld __cnfn native_log2(double8 x);
-double16 __ovld __cnfn native_log2(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute a base 10 logarithm over an implementation-defined
@@ -9947,14 +9892,6 @@ float3 __ovld __cnfn native_log10(float3 x);
float4 __ovld __cnfn native_log10(float4 x);
float8 __ovld __cnfn native_log10(float8 x);
float16 __ovld __cnfn native_log10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log10(double x);
-double2 __ovld __cnfn native_log10(double2 x);
-double3 __ovld __cnfn native_log10(double3 x);
-double4 __ovld __cnfn native_log10(double4 x);
-double8 __ovld __cnfn native_log10(double8 x);
-double16 __ovld __cnfn native_log10(double16 x);
-#endif //cl_khr_fp64
/**
* Compute x to the power y, where x is >= 0. The range of
@@ -9967,14 +9904,6 @@ float3 __ovld __cnfn native_powr(float3 x, float3 y);
float4 __ovld __cnfn native_powr(float4 x, float4 y);
float8 __ovld __cnfn native_powr(float8 x, float8 y);
float16 __ovld __cnfn native_powr(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_powr(double x, double y);
-double2 __ovld __cnfn native_powr(double2 x, double2 y);
-double3 __ovld __cnfn native_powr(double3 x, double3 y);
-double4 __ovld __cnfn native_powr(double4 x, double4 y);
-double8 __ovld __cnfn native_powr(double8 x, double8 y);
-double16 __ovld __cnfn native_powr(double16 x, double16 y);
-#endif //cl_khr_fp64
/**
* Compute reciprocal over an implementation-defined
@@ -9986,14 +9915,6 @@ float3 __ovld __cnfn native_recip(float3 x);
float4 __ovld __cnfn native_recip(float4 x);
float8 __ovld __cnfn native_recip(float8 x);
float16 __ovld __cnfn native_recip(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_recip(double x);
-double2 __ovld __cnfn native_recip(double2 x);
-double3 __ovld __cnfn native_recip(double3 x);
-double4 __ovld __cnfn native_recip(double4 x);
-double8 __ovld __cnfn native_recip(double8 x);
-double16 __ovld __cnfn native_recip(double16 x);
-#endif //cl_khr_fp64
/**
 * Compute inverse square root over an implementation-defined
@@ -10005,14 +9926,6 @@ float3 __ovld __cnfn native_rsqrt(float3 x);
float4 __ovld __cnfn native_rsqrt(float4 x);
float8 __ovld __cnfn native_rsqrt(float8 x);
float16 __ovld __cnfn native_rsqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_rsqrt(double x);
-double2 __ovld __cnfn native_rsqrt(double2 x);
-double3 __ovld __cnfn native_rsqrt(double3 x);
-double4 __ovld __cnfn native_rsqrt(double4 x);
-double8 __ovld __cnfn native_rsqrt(double8 x);
-double16 __ovld __cnfn native_rsqrt(double16 x);
-#endif //cl_khr_fp64
/**
* Compute sine over an implementation-defined range.
@@ -10024,14 +9937,6 @@ float3 __ovld __cnfn native_sin(float3 x);
float4 __ovld __cnfn native_sin(float4 x);
float8 __ovld __cnfn native_sin(float8 x);
float16 __ovld __cnfn native_sin(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sin(double x);
-double2 __ovld __cnfn native_sin(double2 x);
-double3 __ovld __cnfn native_sin(double3 x);
-double4 __ovld __cnfn native_sin(double4 x);
-double8 __ovld __cnfn native_sin(double8 x);
-double16 __ovld __cnfn native_sin(double16 x);
-#endif //cl_khr_fp64
/**
* Compute square root over an implementation-defined
@@ -10043,14 +9948,6 @@ float3 __ovld __cnfn native_sqrt(float3 x);
float4 __ovld __cnfn native_sqrt(float4 x);
float8 __ovld __cnfn native_sqrt(float8 x);
float16 __ovld __cnfn native_sqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sqrt(double x);
-double2 __ovld __cnfn native_sqrt(double2 x);
-double3 __ovld __cnfn native_sqrt(double3 x);
-double4 __ovld __cnfn native_sqrt(double4 x);
-double8 __ovld __cnfn native_sqrt(double8 x);
-double16 __ovld __cnfn native_sqrt(double16 x);
-#endif //cl_khr_fp64
/**
* Compute tangent over an implementation-defined range.
@@ -10062,14 +9959,6 @@ float3 __ovld __cnfn native_tan(float3 x);
float4 __ovld __cnfn native_tan(float4 x);
float8 __ovld __cnfn native_tan(float8 x);
float16 __ovld __cnfn native_tan(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_tan(double x);
-double2 __ovld __cnfn native_tan(double2 x);
-double3 __ovld __cnfn native_tan(double3 x);
-double4 __ovld __cnfn native_tan(double4 x);
-double8 __ovld __cnfn native_tan(double8 x);
-double16 __ovld __cnfn native_tan(double16 x);
-#endif //cl_khr_fp64
// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
@@ -13934,7 +13823,7 @@ typedef uint cl_mem_fence_flags;
* image objects and then want to read the updated data.
*/
-void __ovld barrier(cl_mem_fence_flags flags);
+void __ovld __conv barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
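The new __conv macro applies Clang's convergent attribute, which tells the optimizer it may not move a call into or out of divergent control flow, exactly the guarantee barriers need. A sketch of the usage contract in OpenCL C (every work-item in the group must reach the barrier):

    __kernel void reverse_local(__global int *data, __local int *tmp) {
      size_t lid = get_local_id(0);
      size_t n = get_local_size(0);
      tmp[lid] = data[get_global_id(0)];
      barrier(CLK_LOCAL_MEM_FENCE);     /* uniform: all work-items rendezvous */
      data[get_global_id(0)] = tmp[n - 1u - lid];
    }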
@@ -13947,8 +13836,8 @@ typedef enum memory_scope
memory_scope_sub_group
} memory_scope;
-void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
-void __ovld work_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@@ -14728,6 +14617,13 @@ int __ovld atom_xor(volatile __local int *p, int val);
unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
#endif
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
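The added overloads fill a gap: atom_xor previously had only 32-bit forms here, while cl_khr_int64_extended_atomics also specifies 64-bit ones. A sketch, assuming a device that reports the extension:

    #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

    __kernel void toggle_flags(volatile __global long *flags, long mask) {
      if (get_global_id(0) == 0)
        atom_xor(flags, mask);   /* 64-bit XOR, now declared by the header */
    }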
@@ -15564,9 +15460,11 @@ half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
#endif //cl_khr_fp16
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...);
+#endif
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
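Guarding printf is correct because it entered OpenCL C in version 1.2; earlier translation units should not see the declaration. Minimal use, valid only under -cl-std=CL1.2 or newer (sketch):

    __kernel void debug_ids(void) {
      printf("gid %u\n", (uint)get_global_id(0));
    }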
@@ -15592,6 +15490,10 @@ int printf(__constant const char* st, ...);
#define CLK_FILTER_NEAREST 0x10
#define CLK_FILTER_LINEAR 0x20
+#ifdef cl_khr_gl_msaa_sharing
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
+#endif //cl_khr_gl_msaa_sharing
+
/**
* Use the coordinate (coord.xy) to do an element lookup in
* the 2D image object specified by image.
@@ -16493,6 +16395,7 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
#define CLK_sRGBA 0x10C1
#define CLK_sRGBx 0x10C0
#define CLK_sBGRA 0x10C2
+#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
@@ -16670,101 +16573,101 @@ int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
// OpenCL v2.0 s6.13.15 - Work-group Functions
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld work_group_all(int predicate);
-int __ovld work_group_any(int predicate);
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);
#ifdef cl_khr_fp16
-half __ovld work_group_broadcast(half a, size_t local_id);
-half __ovld work_group_broadcast(half a, size_t x, size_t y);
-half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z);
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
#endif
-int __ovld work_group_broadcast(int a, size_t local_id);
-int __ovld work_group_broadcast(int a, size_t x, size_t y);
-int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z);
-uint __ovld work_group_broadcast(uint a, size_t local_id);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z);
-long __ovld work_group_broadcast(long a, size_t local_id);
-long __ovld work_group_broadcast(long a, size_t x, size_t y);
-long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z);
-ulong __ovld work_group_broadcast(ulong a, size_t local_id);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
-float __ovld work_group_broadcast(float a, size_t local_id);
-float __ovld work_group_broadcast(float a, size_t x, size_t y);
-float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z);
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
#ifdef cl_khr_fp64
-double __ovld work_group_broadcast(double a, size_t local_id);
-double __ovld work_group_broadcast(double a, size_t x, size_t y);
-double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z);
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
-half __ovld work_group_reduce_add(half x);
-half __ovld work_group_reduce_min(half x);
-half __ovld work_group_reduce_max(half x);
-half __ovld work_group_scan_exclusive_add(half x);
-half __ovld work_group_scan_exclusive_min(half x);
-half __ovld work_group_scan_exclusive_max(half x);
-half __ovld work_group_scan_inclusive_add(half x);
-half __ovld work_group_scan_inclusive_min(half x);
-half __ovld work_group_scan_inclusive_max(half x);
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
#endif
-int __ovld work_group_reduce_add(int x);
-int __ovld work_group_reduce_min(int x);
-int __ovld work_group_reduce_max(int x);
-int __ovld work_group_scan_exclusive_add(int x);
-int __ovld work_group_scan_exclusive_min(int x);
-int __ovld work_group_scan_exclusive_max(int x);
-int __ovld work_group_scan_inclusive_add(int x);
-int __ovld work_group_scan_inclusive_min(int x);
-int __ovld work_group_scan_inclusive_max(int x);
-uint __ovld work_group_reduce_add(uint x);
-uint __ovld work_group_reduce_min(uint x);
-uint __ovld work_group_reduce_max(uint x);
-uint __ovld work_group_scan_exclusive_add(uint x);
-uint __ovld work_group_scan_exclusive_min(uint x);
-uint __ovld work_group_scan_exclusive_max(uint x);
-uint __ovld work_group_scan_inclusive_add(uint x);
-uint __ovld work_group_scan_inclusive_min(uint x);
-uint __ovld work_group_scan_inclusive_max(uint x);
-long __ovld work_group_reduce_add(long x);
-long __ovld work_group_reduce_min(long x);
-long __ovld work_group_reduce_max(long x);
-long __ovld work_group_scan_exclusive_add(long x);
-long __ovld work_group_scan_exclusive_min(long x);
-long __ovld work_group_scan_exclusive_max(long x);
-long __ovld work_group_scan_inclusive_add(long x);
-long __ovld work_group_scan_inclusive_min(long x);
-long __ovld work_group_scan_inclusive_max(long x);
-ulong __ovld work_group_reduce_add(ulong x);
-ulong __ovld work_group_reduce_min(ulong x);
-ulong __ovld work_group_reduce_max(ulong x);
-ulong __ovld work_group_scan_exclusive_add(ulong x);
-ulong __ovld work_group_scan_exclusive_min(ulong x);
-ulong __ovld work_group_scan_exclusive_max(ulong x);
-ulong __ovld work_group_scan_inclusive_add(ulong x);
-ulong __ovld work_group_scan_inclusive_min(ulong x);
-ulong __ovld work_group_scan_inclusive_max(ulong x);
-float __ovld work_group_reduce_add(float x);
-float __ovld work_group_reduce_min(float x);
-float __ovld work_group_reduce_max(float x);
-float __ovld work_group_scan_exclusive_add(float x);
-float __ovld work_group_scan_exclusive_min(float x);
-float __ovld work_group_scan_exclusive_max(float x);
-float __ovld work_group_scan_inclusive_add(float x);
-float __ovld work_group_scan_inclusive_min(float x);
-float __ovld work_group_scan_inclusive_max(float x);
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp64
-double __ovld work_group_reduce_add(double x);
-double __ovld work_group_reduce_min(double x);
-double __ovld work_group_reduce_max(double x);
-double __ovld work_group_scan_exclusive_add(double x);
-double __ovld work_group_scan_exclusive_min(double x);
-double __ovld work_group_scan_exclusive_max(double x);
-double __ovld work_group_scan_inclusive_add(double x);
-double __ovld work_group_scan_inclusive_min(double x);
-double __ovld work_group_scan_inclusive_max(double x);
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
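All of the work-group collectives above gain __conv for the same reason as the barriers: every work-item in the group participates, so calls must stay in uniform control flow. A per-group sum under OpenCL 2.0 (sketch):

    __kernel void group_sums(__global const float *in, __global float *out) {
      float sum = work_group_reduce_add(in[get_global_id(0)]); /* convergent */
      if (get_local_id(0) == 0)
        out[get_group_id(0)] = sum;   /* one result per work-group */
    }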
@@ -16840,11 +16743,11 @@ void __ovld retain_event(clk_event_t);
void __ovld release_event(clk_event_t);
-clk_event_t create_user_event(void);
+clk_event_t __ovld create_user_event(void);
void __ovld set_user_event_status(clk_event_t e, int state);
-bool is_valid_event (clk_event_t event);
+bool __ovld is_valid_event (clk_event_t event);
void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
@@ -16864,96 +16767,286 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);
-void __ovld sub_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-void __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld sub_group_all(int predicate);
-int __ovld sub_group_any(int predicate);
-
-int __ovld sub_group_broadcast(int x, uint sub_group_local_id);
-uint __ovld sub_group_broadcast(uint x, uint sub_group_local_id);
-long __ovld sub_group_broadcast(long x, uint sub_group_local_id);
-ulong __ovld sub_group_broadcast(ulong x, uint sub_group_local_id);
-float __ovld sub_group_broadcast(float x, uint sub_group_local_id);
-
-int __ovld sub_group_reduce_add(int x);
-uint __ovld sub_group_reduce_add(uint x);
-long __ovld sub_group_reduce_add(long x);
-ulong __ovld sub_group_reduce_add(ulong x);
-float __ovld sub_group_reduce_add(float x);
-int __ovld sub_group_reduce_min(int x);
-uint __ovld sub_group_reduce_min(uint x);
-long __ovld sub_group_reduce_min(long x);
-ulong __ovld sub_group_reduce_min(ulong x);
-float __ovld sub_group_reduce_min(float x);
-int __ovld sub_group_reduce_max(int x);
-uint __ovld sub_group_reduce_max(uint x);
-long __ovld sub_group_reduce_max(long x);
-ulong __ovld sub_group_reduce_max(ulong x);
-float __ovld sub_group_reduce_max(float x);
-
-int __ovld sub_group_scan_exclusive_add(int x);
-uint __ovld sub_group_scan_exclusive_add(uint x);
-long __ovld sub_group_scan_exclusive_add(long x);
-ulong __ovld sub_group_scan_exclusive_add(ulong x);
-float __ovld sub_group_scan_exclusive_add(float x);
-int __ovld sub_group_scan_exclusive_min(int x);
-uint __ovld sub_group_scan_exclusive_min(uint x);
-long __ovld sub_group_scan_exclusive_min(long x);
-ulong __ovld sub_group_scan_exclusive_min(ulong x);
-float __ovld sub_group_scan_exclusive_min(float x);
-int __ovld sub_group_scan_exclusive_max(int x);
-uint __ovld sub_group_scan_exclusive_max(uint x);
-long __ovld sub_group_scan_exclusive_max(long x);
-ulong __ovld sub_group_scan_exclusive_max(ulong x);
-float __ovld sub_group_scan_exclusive_max(float x);
-
-int __ovld sub_group_scan_inclusive_add(int x);
-uint __ovld sub_group_scan_inclusive_add(uint x);
-long __ovld sub_group_scan_inclusive_add(long x);
-ulong __ovld sub_group_scan_inclusive_add(ulong x);
-float __ovld sub_group_scan_inclusive_add(float x);
-int __ovld sub_group_scan_inclusive_min(int x);
-uint __ovld sub_group_scan_inclusive_min(uint x);
-long __ovld sub_group_scan_inclusive_min(long x);
-ulong __ovld sub_group_scan_inclusive_min(ulong x);
-float __ovld sub_group_scan_inclusive_min(float x);
-int __ovld sub_group_scan_inclusive_max(int x);
-uint __ovld sub_group_scan_inclusive_max(uint x);
-long __ovld sub_group_scan_inclusive_max(long x);
-ulong __ovld sub_group_scan_inclusive_max(ulong x);
-float __ovld sub_group_scan_inclusive_max(float x);
+int __ovld __conv sub_group_all(int predicate);
+int __ovld __conv sub_group_any(int predicate);
+
+int __ovld __conv sub_group_broadcast(int x, uint sub_group_local_id);
+uint __ovld __conv sub_group_broadcast(uint x, uint sub_group_local_id);
+long __ovld __conv sub_group_broadcast(long x, uint sub_group_local_id);
+ulong __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
+
+int __ovld __conv sub_group_reduce_add(int x);
+uint __ovld __conv sub_group_reduce_add(uint x);
+long __ovld __conv sub_group_reduce_add(long x);
+ulong __ovld __conv sub_group_reduce_add(ulong x);
+float __ovld __conv sub_group_reduce_add(float x);
+int __ovld __conv sub_group_reduce_min(int x);
+uint __ovld __conv sub_group_reduce_min(uint x);
+long __ovld __conv sub_group_reduce_min(long x);
+ulong __ovld __conv sub_group_reduce_min(ulong x);
+float __ovld __conv sub_group_reduce_min(float x);
+int __ovld __conv sub_group_reduce_max(int x);
+uint __ovld __conv sub_group_reduce_max(uint x);
+long __ovld __conv sub_group_reduce_max(long x);
+ulong __ovld __conv sub_group_reduce_max(ulong x);
+float __ovld __conv sub_group_reduce_max(float x);
+
+int __ovld __conv sub_group_scan_exclusive_add(int x);
+uint __ovld __conv sub_group_scan_exclusive_add(uint x);
+long __ovld __conv sub_group_scan_exclusive_add(long x);
+ulong __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float __ovld __conv sub_group_scan_exclusive_add(float x);
+int __ovld __conv sub_group_scan_exclusive_min(int x);
+uint __ovld __conv sub_group_scan_exclusive_min(uint x);
+long __ovld __conv sub_group_scan_exclusive_min(long x);
+ulong __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float __ovld __conv sub_group_scan_exclusive_min(float x);
+int __ovld __conv sub_group_scan_exclusive_max(int x);
+uint __ovld __conv sub_group_scan_exclusive_max(uint x);
+long __ovld __conv sub_group_scan_exclusive_max(long x);
+ulong __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float __ovld __conv sub_group_scan_exclusive_max(float x);
+
+int __ovld __conv sub_group_scan_inclusive_add(int x);
+uint __ovld __conv sub_group_scan_inclusive_add(uint x);
+long __ovld __conv sub_group_scan_inclusive_add(long x);
+ulong __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float __ovld __conv sub_group_scan_inclusive_add(float x);
+int __ovld __conv sub_group_scan_inclusive_min(int x);
+uint __ovld __conv sub_group_scan_inclusive_min(uint x);
+long __ovld __conv sub_group_scan_inclusive_min(long x);
+ulong __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float __ovld __conv sub_group_scan_inclusive_min(float x);
+int __ovld __conv sub_group_scan_inclusive_max(int x);
+uint __ovld __conv sub_group_scan_inclusive_max(uint x);
+long __ovld __conv sub_group_scan_inclusive_max(long x);
+ulong __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float __ovld __conv sub_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp16
-half __ovld sub_group_broadcast(half x, uint sub_group_local_id);
-half __ovld sub_group_reduce_add(half x);
-half __ovld sub_group_reduce_min(half x);
-half __ovld sub_group_reduce_max(half x);
-half __ovld sub_group_scan_exclusive_add(half x);
-half __ovld sub_group_scan_exclusive_min(half x);
-half __ovld sub_group_scan_exclusive_max(half x);
-half __ovld sub_group_scan_inclusive_add(half x);
-half __ovld sub_group_scan_inclusive_min(half x);
-half __ovld sub_group_scan_inclusive_max(half x);
+half __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half __ovld __conv sub_group_reduce_add(half x);
+half __ovld __conv sub_group_reduce_min(half x);
+half __ovld __conv sub_group_reduce_max(half x);
+half __ovld __conv sub_group_scan_exclusive_add(half x);
+half __ovld __conv sub_group_scan_exclusive_min(half x);
+half __ovld __conv sub_group_scan_exclusive_max(half x);
+half __ovld __conv sub_group_scan_inclusive_add(half x);
+half __ovld __conv sub_group_scan_inclusive_min(half x);
+half __ovld __conv sub_group_scan_inclusive_max(half x);
#endif //cl_khr_fp16
#ifdef cl_khr_fp64
-double __ovld sub_group_broadcast(double x, uint sub_group_local_id);
-double __ovld sub_group_reduce_add(double x);
-double __ovld sub_group_reduce_min(double x);
-double __ovld sub_group_reduce_max(double x);
-double __ovld sub_group_scan_exclusive_add(double x);
-double __ovld sub_group_scan_exclusive_min(double x);
-double __ovld sub_group_scan_exclusive_max(double x);
-double __ovld sub_group_scan_inclusive_add(double x);
-double __ovld sub_group_scan_inclusive_min(double x);
-double __ovld sub_group_scan_inclusive_max(double x);
+double __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double __ovld __conv sub_group_reduce_add(double x);
+double __ovld __conv sub_group_reduce_min(double x);
+double __ovld __conv sub_group_reduce_max(double x);
+double __ovld __conv sub_group_scan_exclusive_add(double x);
+double __ovld __conv sub_group_scan_exclusive_min(double x);
+double __ovld __conv sub_group_scan_exclusive_max(double x);
+double __ovld __conv sub_group_scan_inclusive_add(double x);
+double __ovld __conv sub_group_scan_inclusive_min(double x);
+double __ovld __conv sub_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif //cl_khr_subgroups cl_intel_subgroups
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src2);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
// Disable any extensions we may have enabled previously.
#pragma OPENCL EXTENSION all : disable
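The cl_amd_media_ops block declares AMD's media built-ins; amd_bitalign, for instance, is documented by AMD as a funnel shift, ((((ulong)src0 << 32) | src1) >> (src2 & 31)). A guarded sketch (semantics per AMD's extension spec, not verified here):

    #ifdef cl_amd_media_ops
    __kernel void funnel(__global uint *out, uint hi, uint lo, uint shift) {
      out[0] = amd_bitalign(hi, lo, shift); /* low 32 bits of (hi:lo) >> shift */
    }
    #endif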
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
index 5b1058069c44..d4f6487af179 100644
--- a/lib/Headers/pmmintrin.h
+++ b/lib/Headers/pmmintrin.h
@@ -37,7 +37,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VLDDQU instruction.
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit integer vector containing integer values.
@@ -53,7 +53,7 @@ _mm_lddqu_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the left source operand.
@@ -72,7 +72,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -95,7 +95,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -115,18 +115,18 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
-/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
-/// the destination.
-/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
+/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -135,20 +135,19 @@ _mm_movehdup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
-/// \brief Duplicates low-order (even-indexed) values from a 128-bit
-/// vector of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
-/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
-/// the destination.
-/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
+/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
+/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -162,7 +161,7 @@ _mm_moveldup_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand.
@@ -181,7 +180,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
@@ -204,7 +203,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
@@ -231,7 +230,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
/// __m128d _mm_loaddup_pd(double const * dp);
/// \endcode
///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
/// A pointer to a double-precision value to be moved and duplicated.
@@ -245,7 +244,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
@@ -272,7 +271,7 @@ _mm_movedup_pd(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MONITOR instruction.
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
@@ -293,7 +292,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MWAIT instruction.
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which may vary by
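A common use of the SSE3 duplication intrinsics documented above is a horizontal sum; a sketch (the helper is illustrative, not part of the header):

    #include <pmmintrin.h>

    /* Sum the four lanes of v using _mm_movehdup_ps. */
    static float hsum_ps(__m128 v) {
      __m128 shuf = _mm_movehdup_ps(v);   /* {v1, v1, v3, v3} */
      __m128 sums = _mm_add_ps(v, shuf);  /* {v0+v1, ., v2+v3, .} */
      shuf = _mm_movehl_ps(shuf, sums);   /* bring v2+v3 down to lane 0 */
      sums = _mm_add_ss(sums, shuf);      /* v0+v1+v2+v3 in lane 0 */
      return _mm_cvtss_f32(sums);
    }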
diff --git a/lib/Headers/popcntintrin.h b/lib/Headers/popcntintrin.h
index 7e2f1670805f..0b4793e58bcb 100644
--- a/lib/Headers/popcntintrin.h
+++ b/lib/Headers/popcntintrin.h
@@ -31,7 +31,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 32-bit integer operand.
@@ -47,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 32-bit integer operand.
@@ -64,7 +64,7 @@ _popcnt32(int __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 64-bit integer operand.
@@ -80,7 +80,7 @@ _mm_popcnt_u64(unsigned long long __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 64-bit integer operand.
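A minimal sketch of the population-count intrinsics (assumes a target with POPCNT enabled, e.g. -mpopcnt):

    #include <popcntintrin.h>

    static int popcnt_demo(void) {
        unsigned int x = 0xF0F0u;   /* eight bits set */
        return _mm_popcnt_u32(x);   /* returns 8 */
    }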
diff --git a/lib/Headers/stdatomic.h b/lib/Headers/stdatomic.h
index e03798766014..23bb3a357768 100644
--- a/lib/Headers/stdatomic.h
+++ b/lib/Headers/stdatomic.h
@@ -45,11 +45,11 @@ extern "C" {
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
-#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
-#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
-#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
-#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
-#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
/* 7.17.2 Initialization */
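The renamed macros follow the C11 spelling; a quick sketch of how a program might consult one of them:

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void) {
        /* 2 = always lock-free, 1 = sometimes, 0 = never (C11 7.17.1) */
        printf("atomic int lock-free level: %d\n", ATOMIC_INT_LOCK_FREE);
        return 0;
    }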
diff --git a/lib/Headers/tmmintrin.h b/lib/Headers/tmmintrin.h
index a72796ba4a68..80664043a06f 100644
--- a/lib/Headers/tmmintrin.h
+++ b/lib/Headers/tmmintrin.h
@@ -483,15 +483,15 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \param __b
/// A 128-bit integer vector containing the second source operand.
/// \returns A 128-bit integer vector containing the sums of products of both
-/// operands:
-/// R0 := (__a0 * __b0) + (__a1 * __b1)
-/// R1 := (__a2 * __b2) + (__a3 * __b3)
-/// R2 := (__a4 * __b4) + (__a5 * __b5)
-/// R3 := (__a6 * __b6) + (__a7 * __b7)
-/// R4 := (__a8 * __b8) + (__a9 * __b9)
-/// R5 := (__a10 * __b10) + (__a11 * __b11)
-/// R6 := (__a12 * __b12) + (__a13 * __b13)
-/// R7 := (__a14 * __b14) + (__a15 * __b15)
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
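A minimal sketch of the multiply-accumulate described above (requires SSSE3; the first operand is treated as unsigned bytes, the second as signed):

    #include <tmmintrin.h>

    static __m128i maddubs_demo(void) {
        __m128i u = _mm_set1_epi8(2);   /* unsigned 8-bit values */
        __m128i s = _mm_set1_epi8(3);   /* signed 8-bit values   */
        /* every 16-bit lane becomes 2*3 + 2*3 = 12 (saturating add) */
        return _mm_maddubs_epi16(u, s);
    }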
@@ -516,11 +516,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \param __b
/// A 64-bit integer vector containing the second source operand.
/// \returns A 64-bit integer vector containing the sums of products of both
-/// operands:
-/// R0 := (__a0 * __b0) + (__a1 * __b1)
-/// R1 := (__a2 * __b2) + (__a3 * __b3)
-/// R2 := (__a4 * __b4) + (__a5 * __b5)
-/// R3 := (__a6 * __b6) + (__a7 * __b7)
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
@@ -580,11 +580,11 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination:
-/// Bit 7:
-/// 1: Clear the corresponding byte in the destination.
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination.
-/// Bits [6:4] Reserved.
+/// destination. \n
+/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -606,10 +606,10 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination:
-/// Bit 7:
-/// 1: Clear the corresponding byte in the destination.
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination.
+/// destination. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
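A minimal sketch using the control-byte scheme documented above to reverse the 16 bytes of a vector (requires SSSE3):

    #include <tmmintrin.h>

    static __m128i reverse_bytes(__m128i v) {
        /* control byte i selects source byte 15-i; bit 7 stays clear */
        const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                         8, 9, 10, 11, 12, 13, 14, 15);
        return _mm_shuffle_epi8(v, rev);
    }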
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 99cddb0fac82..dc31b85cfd7c 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -46,7 +46,7 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -69,7 +69,7 @@ _mm_add_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -88,7 +88,7 @@ _mm_add_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
@@ -112,7 +112,7 @@ _mm_sub_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the minuend.
@@ -131,7 +131,7 @@ _mm_sub_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -154,7 +154,7 @@ _mm_mul_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -173,7 +173,7 @@ _mm_mul_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
@@ -195,7 +195,7 @@ _mm_div_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the dividend.
@@ -214,7 +214,7 @@ _mm_div_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -233,7 +233,7 @@ _mm_sqrt_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -250,7 +250,7 @@ _mm_sqrt_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -269,7 +269,7 @@ _mm_rcp_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -286,7 +286,7 @@ _mm_rcp_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -306,7 +306,7 @@ _mm_rsqrt_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -324,7 +324,7 @@ _mm_rsqrt_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -341,12 +341,12 @@ _mm_min_ss(__m128 __a, __m128 __b)
return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
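The min/max pair composes into the usual clamp idiom; a minimal sketch:

    #include <xmmintrin.h>

    static __m128 clamp01(__m128 v) {
        /* clamp every lane to [0.0, 1.0] */
        return _mm_min_ps(_mm_max_ps(v, _mm_setzero_ps()),
                          _mm_set1_ps(1.0f));
    }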
-/// \brief Compares two 128-bit vectors of [4 x float] and returns the
-/// lesser of each pair of values.
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
+/// of each pair of values.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands.
@@ -361,12 +361,12 @@ _mm_min_ps(__m128 __a, __m128 __b)
}
/// \brief Compares two 32-bit float values in the low-order bits of both
-/// operands and returns the greater value in the low-order bits of
-/// a vector [4 x float].
+/// operands and returns the greater value in the low-order bits of a 128-bit
+/// vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -388,7 +388,7 @@ _mm_max_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands.
@@ -406,7 +406,7 @@ _mm_max_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
///
/// \param __a
/// A 128-bit vector containing one of the source operands.
@@ -426,7 +426,7 @@ _mm_and_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the first source operand. The
@@ -446,7 +446,7 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -465,7 +465,7 @@ _mm_or_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -485,7 +485,7 @@ _mm_xor_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -506,7 +506,7 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -526,7 +526,7 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -548,7 +548,7 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -569,7 +569,7 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -591,7 +591,7 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -611,7 +611,7 @@ _mm_cmple_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -635,7 +635,7 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -656,7 +656,7 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -680,7 +680,7 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -699,7 +699,8 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -720,7 +721,8 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -740,7 +742,8 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -762,7 +765,8 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -783,7 +787,8 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -805,7 +810,8 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -826,7 +832,8 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -850,7 +857,8 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -871,7 +879,8 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -895,7 +904,8 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -916,7 +926,8 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -938,7 +949,8 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -959,7 +971,8 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -981,7 +994,8 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -999,7 +1013,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1020,7 +1035,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1041,7 +1057,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1062,7 +1078,7 @@ _mm_comile_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1083,7 +1099,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1104,7 +1120,7 @@ _mm_comige_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1125,7 +1141,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1146,7 +1162,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1162,13 +1178,13 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
}
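A minimal sketch of a scalar unordered compare (NaN inputs compare unordered, for which the less-than predicate typically yields 0):

    #include <xmmintrin.h>

    static int scalar_less(float x, float y) {
        return _mm_ucomilt_ss(_mm_set_ss(x), _mm_set_ss(y)); /* 1 if x < y */
    }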
/// \brief Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand
-/// is less than or equal to the second operand and returns the result of
-/// the comparison.
+/// the low-order bits of both operands to determine if the first operand is
+/// less than or equal to the second operand and returns the result of the
+/// comparison.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1184,13 +1200,13 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
}
/// \brief Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand
-/// is greater than the second operand and returns the result of the
+/// the low-order bits of both operands to determine if the first operand is
+/// greater than the second operand and returns the result of the
/// comparison.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1212,7 +1228,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1233,7 +1249,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1253,7 +1269,8 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1270,7 +1287,8 @@ _mm_cvtss_si32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1289,7 +1307,8 @@ _mm_cvt_ss2si(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1308,7 +1327,7 @@ _mm_cvtss_si64(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1324,7 +1343,7 @@ _mm_cvtps_pi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1341,7 +1360,8 @@ _mm_cvt_ps2pi(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1359,7 +1379,8 @@ _mm_cvttss_si32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1371,13 +1392,15 @@ _mm_cvtt_ss2si(__m128 __a)
return _mm_cvttss_si32(__a);
}
+#ifdef __x86_64__
/// \brief Converts a float value contained in the lower 32 bits of a vector of
/// [4 x float] into a 64-bit integer, truncating the result when it is
/// inexact.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1388,6 +1411,7 @@ _mm_cvttss_si64(__m128 __a)
{
return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
+#endif
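The truncating conversions round toward zero regardless of the MXCSR rounding mode (note that the 64-bit variant above is now gated on __x86_64__); a minimal sketch:

    #include <xmmintrin.h>

    static int trunc_demo(void) {
        return _mm_cvttss_si32(_mm_set_ss(-2.9f)); /* truncates toward zero: -2 */
    }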
/// \brief Converts two low-order float values in a 128-bit vector of
/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
@@ -1395,7 +1419,8 @@ _mm_cvttss_si64(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
+/// This intrinsic corresponds to the <c> VCVTTPS2PI / CVTTPS2PI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1412,7 +1437,7 @@ _mm_cvttps_pi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1430,7 +1455,7 @@ _mm_cvtt_ps2pi(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1453,7 +1478,7 @@ _mm_cvtsi32_ss(__m128 __a, int __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1477,7 +1502,7 @@ _mm_cvt_si2ss(__m128 __a, int __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1502,7 +1527,7 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1525,7 +1550,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1546,7 +1571,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1558,13 +1583,13 @@ _mm_cvtss_f32(__m128 __a)
return __a[0];
}
-/// \brief Loads two packed float values from the address __p into the
+/// \brief Loads two packed float values from the address \a __p into the
/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
/// are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
@@ -1585,13 +1610,13 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p)
return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
-/// \brief Loads two packed float values from the address __p into the low-order
-/// bits of a 128-bit vector of [4 x float]. The high-order bits are copied
-/// from the high-order bits of the first operand.
+/// \brief Loads two packed float values from the address \a __p into the
+/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+/// are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
@@ -1619,7 +1644,7 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location containing a single-precision
@@ -1642,13 +1667,13 @@ _mm_load_ss(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a float value to be loaded and duplicated.
-/// \returns A 128-bit vector of [4 x float] containing the loaded
-/// and duplicated values.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+/// duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
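A minimal sketch contrasting the scalar, broadcast, and unaligned loads (assumes p points at four readable floats):

    #include <xmmintrin.h>

    static __m128 load_demo(const float *p) {
        __m128 one   = _mm_load_ss(p);   /* {p[0], 0, 0, 0}              */
        __m128 bcast = _mm_load1_ps(p);  /* {p[0], p[0], p[0], p[0]}     */
        __m128 four  = _mm_loadu_ps(p);  /* p[0..3], no alignment needed */
        return _mm_add_ps(one, _mm_add_ps(bcast, four));
    }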
@@ -1666,7 +1691,7 @@ _mm_load1_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1683,7 +1708,7 @@ _mm_load_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1703,7 +1728,7 @@ _mm_loadu_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
@@ -1725,7 +1750,6 @@ _mm_loadr_ps(const float *__p)
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
-
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
@@ -1738,7 +1762,7 @@ _mm_undefined_ps(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize the lower 32
@@ -1758,7 +1782,7 @@ _mm_set_ss(float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize each vector
@@ -1777,7 +1801,7 @@ _mm_set1_ps(float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize each vector
@@ -1849,7 +1873,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
@@ -1864,7 +1888,7 @@ _mm_setzero_ps(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
///
/// \param __p
/// A pointer to a 64-bit memory location.
@@ -1881,7 +1905,7 @@ _mm_storeh_pi(__m64 *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
///
/// \param __p
/// A pointer to a memory location that will receive the float values.
@@ -1898,7 +1922,7 @@ _mm_storel_pi(__m64 *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location.
@@ -1913,12 +1937,12 @@ _mm_store_ss(float *__p, __m128 __a)
((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-/// unaligned memory location.
+/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
+/// location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1934,19 +1958,18 @@ _mm_storeu_ps(float *__p, __m128 __a)
((struct __storeu_ps*)__p)->__v = __a;
}
-/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-/// four contiguous elements in an aligned memory location.
+/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
+/// location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
-/// instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
-/// A pointer to a 128-bit memory location.
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
/// \param __a
-/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed by __p.
+/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
@@ -1958,14 +1981,14 @@ _mm_store_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location.
/// \param __a
/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed by __p.
+///    of the four contiguous elements pointed to by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
@@ -1973,18 +1996,19 @@ _mm_store1_ps(float *__p, __m128 __a)
_mm_store_ps(__p, __a);
}
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-/// aligned memory location.
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+/// four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
///
/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 128-bit aligned.
+/// A pointer to a 128-bit memory location.
/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
+/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed to by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
@@ -1996,7 +2020,7 @@ _mm_store_ps1(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
@@ -2029,20 +2053,21 @@ _mm_storer_ps(float *__p, __m128 __a)
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
-/// This intrinsic corresponds to the \c PREFETCHNTA instruction.
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
/// A pointer to a memory location containing a cache line of data.
/// \param sel
-/// A predefined integer constant specifying the type of prefetch operation:
-/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
-/// The PREFETCHNTA instruction will be generated.
+/// A predefined integer constant specifying the type of prefetch
+/// operation: \n
+/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+/// PREFETCHNTA instruction will be generated. \n
/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
-/// be generated.
+/// be generated. \n
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
-/// be generated.
+/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-/// be generated.
+/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif
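A minimal sketch of an advisory prefetch ahead of a streaming read (the distance of 64 floats is an arbitrary illustration; the hint may be ignored by the hardware):

    #include <xmmintrin.h>

    static float sum_with_prefetch(const float *data, int n) {
        float s = 0.0f;
        for (int i = 0; i < n; ++i) {
            if (i + 64 < n)
                _mm_prefetch((const char *)&data[i + 64], _MM_HINT_T0);
            s += data[i];
        }
        return s;
    }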
@@ -2052,7 +2077,7 @@ _mm_storer_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTQ instruction.
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
/// A pointer to an aligned memory location used to store the register value.
@@ -2070,7 +2095,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit aligned memory location that will receive the
@@ -2083,6 +2108,10 @@ _mm_stream_ps(float *__p, __m128 __a)
__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
/// \brief Forces strong memory ordering (serialization) between store
/// instructions preceding this instruction and store instructions following
/// this instruction, ensuring the system completes all previous stores
@@ -2090,28 +2119,32 @@ _mm_stream_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c SFENCE instruction.
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_sfence(void)
-{
- __builtin_ia32_sfence();
-}
+void _mm_sfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
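The usual pairing of non-temporal stores with the store fence, as a minimal sketch (dst is assumed to be 16-byte aligned):

    #include <xmmintrin.h>

    static void zero_nt(float *dst, int groups_of_4) {
        const __m128 z = _mm_setzero_ps();
        for (int i = 0; i < groups_of_4; ++i)
            _mm_stream_ps(dst + 4 * i, z); /* stores bypass the cache */
        _mm_sfence();                      /* order the streamed stores */
    }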
/// \brief Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+/// \code
+/// int _mm_extract_pi16(__m64 a, int n);
+/// \endcode
///
-/// \param __a
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
/// A 64-bit vector of [4 x i16].
-/// \param __n
-/// An immediate integer operand that determines which bits are extracted:
-/// 0: Bits [15:0] are copied to the destination.
-/// 1: Bits [31:16] are copied to the destination.
-/// 2: Bits [47:32] are copied to the destination.
+/// \param n
+/// An immediate integer operand that determines which bits are extracted: \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) __extension__ ({ \
@@ -2119,26 +2152,30 @@ _mm_sfence(void)
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16 bits of an integer operand at the 16-bit offset
-/// specified by the immediate operand __n.
+/// specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+/// \code
+/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
+/// \endcode
///
-/// \param __a
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param a
/// A 64-bit vector of [4 x i16].
-/// \param __d
+/// \param d
/// An integer. The lower 16-bit value from this operand is written to the
-/// destination at the offset specified by operand __n.
-/// \param __n
+/// destination at the offset specified by operand \a n.
+/// \param n
///    An immediate integer operand that determines which bits are to be used
-/// in the destination.
-/// 0: Bits [15:0] are copied to the destination.
-/// 1: Bits [31:16] are copied to the destination.
-/// 2: Bits [47:32] are copied to the destination.
-/// 3: Bits [63:48] are copied to the destination.
+/// in the destination. \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
+/// 3: Bits [63:48] are copied to the destination. \n
/// The remaining bits in the destination are copied from the corresponding
-/// bits in operand __a.
+/// bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
@@ -2150,7 +2187,7 @@ _mm_sfence(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMAXSW instruction.
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2169,7 +2206,7 @@ _mm_max_pi16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMAXUB instruction.
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2188,7 +2225,7 @@ _mm_max_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMINSW instruction.
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2207,7 +2244,7 @@ _mm_min_pi16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMINUB instruction.
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2226,7 +2263,7 @@ _mm_min_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMOVMSKB instruction.
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values with bits to be extracted.
@@ -2244,7 +2281,7 @@ _mm_movemask_pi8(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULHUW instruction.
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2262,27 +2299,31 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSHUFW instruction.
-///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
/// \param a
/// A 64-bit integer vector containing the values to be shuffled.
/// \param n
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a. The destinations within the 64-bit destination are assigned
-/// values as follows:
-/// Bits [1:0] are used to assign values to bits [15:0] in the destination.
-/// Bits [3:2] are used to assign values to bits [31:16] in the destination.
-/// Bits [5:4] are used to assign values to bits [47:32] in the destination.
-/// Bits [7:6] are used to assign values to bits [63:48] in the destination.
-/// Bit value assignments:
-/// 00: assigned from bits [15:0] of a.
-/// 01: assigned from bits [31:16] of a.
-/// 10: assigned from bits [47:32] of a.
-/// 11: assigned from bits [63:48] of a.
+/// copy from \a a. The destinations within the 64-bit destination are
+/// assigned values as follows: \n
+/// Bits [1:0] are used to assign values to bits [15:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [31:16] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [47:32] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [63:48] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: assigned from bits [15:0] of \a a. \n
+/// 01: assigned from bits [31:16] of \a a. \n
+/// 10: assigned from bits [47:32] of \a a. \n
+/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) __extension__ ({ \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
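A minimal sketch of the selector encoding described above, using the _MM_SHUFFLE helper to reverse the four words (MMX code should normally end with _mm_empty() before x87 use):

    #include <xmmintrin.h>

    static __m64 reverse_words(__m64 v) {
        /* _MM_SHUFFLE(0,1,2,3) == 0x1B: dest words take source words 3,2,1,0 */
        return _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
    }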
@@ -2295,15 +2336,15 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MASKMOVQ instruction.
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
/// A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
/// A 64-bit integer vector operand. The most significant bit from each 8-bit
-/// element determines whether the corresponding element in operand __d is
-/// copied. If the most significant bit of a given element is 1, the
-/// corresponding element in operand __d is copied.
+/// element determines whether the corresponding element in operand \a __d
+/// is copied. If the most significant bit of a given element is 1, the
+/// corresponding element in operand \a __d is copied.
/// \param __p
/// A pointer to a 64-bit memory location that will receive the conditionally
/// copied integer values. The address of the memory location does not have
@@ -2320,7 +2361,7 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAVGB instruction.
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2339,7 +2380,7 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAVGW instruction.
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2359,7 +2400,7 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSADBW instruction.
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2374,24 +2415,42 @@ _mm_sad_pu8(__m64 __a, __m64 __b)
return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
-/// integer value. There are several groups of macros associated with this
+/// integer value.
+///
+/// There are several groups of macros associated with this
/// intrinsic, including:
-/// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+/// <ul>
+/// <li>
+/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
/// _MM_GET_EXCEPTION_STATE().
-/// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// </li>
+/// <li>
+/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-/// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// </li>
+/// <li>
+/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
-/// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// </li>
+/// <li>
+/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
-/// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// </li>
+/// <li>
+/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_GET_DENORMALS_ZERO_MODE().
+/// </li>
+/// </ul>
///
/// For example, the expression below checks if an overflow exception has
/// occurred:
@@ -2402,35 +2461,45 @@ _mm_sad_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
/// register.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_getcsr(void)
-{
- return __builtin_ia32_stmxcsr();
-}
-
-/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
-/// are several groups of macros associated with this intrinsic, including:
-/// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+unsigned int _mm_getcsr(void);
+
+/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
+///
+/// There are several groups of macros associated with this intrinsic,
+/// including:
+/// <ul>
+/// <li>
+/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
-/// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// </li>
+/// <li>
+/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
/// of these macros.
-/// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// </li>
+/// <li>
+/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
-/// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// </li>
+/// <li>
+/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
/// one of these macros.
-/// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// </li>
+/// <li>
+/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+/// </li>
+/// </ul>
///
/// For example, the following expression causes subsequent floating-point
/// operations to round up:
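(The expression referred to here is also outside this hunk. One way to select round-up with the wrappers above; a sketch, not necessarily the exact expression in the header:)

    #include <xmmintrin.h>

    /* _MM_SET_ROUNDING_MODE clears the old rounding bits, then sets
       round-toward-positive-infinity */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);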
@@ -2444,15 +2513,15 @@ _mm_getcsr(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
/// A 32-bit unsigned integer value to be written to the MXCSR register.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_setcsr(unsigned int __i)
-{
- __builtin_ia32_ldmxcsr(__i);
-}
+void _mm_setcsr(unsigned int);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
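(This hunk replaces the inline definitions of _mm_getcsr() and _mm_setcsr() with plain declarations inside the extern "C" block. A usage sketch for the pair, saving and restoring the register around a kernel; the flush-to-zero setting is just an illustrative choice:)

    #include <xmmintrin.h>

    unsigned int saved = _mm_getcsr();           /* capture control/status state */
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  /* e.g. enable flush-to-zero */
    /* ... run the floating-point kernel ... */
    _mm_setcsr(saved);                           /* restore the previous state */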
/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
/// specified by the immediate value operand.
@@ -2463,7 +2532,7 @@ _mm_setcsr(unsigned int __i)
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -2471,18 +2540,23 @@ _mm_setcsr(unsigned int __i)
/// A 128-bit vector of [4 x float].
/// \param mask
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a and b.
-/// Bits [3:0] specify the values copied from operand a.
-/// Bits [7:4] specify the values copied from operand b. The destinations
-/// within the 128-bit destination are assigned values as follows:
-/// Bits [1:0] are used to assign values to bits [31:0] in the destination.
-/// Bits [3:2] are used to assign values to bits [63:32] in the destination.
-/// Bits [5:4] are used to assign values to bits [95:64] in the destination.
-/// Bits [7:6] are used to assign values to bits [127:96] in the destination.
-/// Bit value assignments:
-/// 00: Bits [31:0] copied from the specified operand.
-/// 01: Bits [63:32] copied from the specified operand.
-/// 10: Bits [95:64] copied from the specified operand.
+///    copy from \a a and \a b. \n
+/// Bits [3:0] specify the values copied from operand \a a. \n
+/// Bits [7:4] specify the values copied from operand \a b. \n
+/// The destinations within the 128-bit destination are assigned values as
+/// follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [63:32] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [95:64] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [127:96] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] copied from the specified operand. \n
+/// 01: Bits [63:32] copied from the specified operand. \n
+/// 10: Bits [95:64] copied from the specified operand. \n
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
@@ -2493,20 +2567,19 @@ _mm_setcsr(unsigned int __i)
4 + (((mask) >> 6) & 0x3)); })
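(A worked example of the mask encoding: _MM_SHUFFLE(1, 0, 3, 2) encodes bits [1:0] = 2 and bits [3:2] = 3, selecting from a, and bits [5:4] = 0 and bits [7:6] = 1, selecting from b. A sketch with hypothetical element values:)

    #include <xmmintrin.h>

    __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);  /* a = {0, 1, 2, 3}, low to high */
    __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);  /* b = {4, 5, 6, 7}, low to high */
    __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
    /* r = {a[2], a[3], b[0], b[1]} = {2, 3, 4, 5} */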
/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x
-/// float].
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
-/// Bits [95:64] are written to bits [31:0] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
/// Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
/// A 128-bit vector of [4 x float].
-/// Bits [95:64] are written to bits [63:32] of the destination.
+/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2516,20 +2589,19 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
}
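(A short sketch of the interleave with hypothetical element values; its low-order counterpart _mm_unpacklo_ps, documented next, is included for contrast:)

    #include <xmmintrin.h>

    __m128 a  = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);  /* a = {0, 1, 2, 3} */
    __m128 b  = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);  /* b = {4, 5, 6, 7} */
    __m128 hi = _mm_unpackhi_ps(a, b);  /* {a[2], b[2], a[3], b[3]} = {2, 6, 3, 7} */
    __m128 lo = _mm_unpacklo_ps(a, b);  /* {a[0], b[0], a[1], b[1]} = {0, 4, 1, 5} */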
/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x
-/// float].
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
-/// Bits [31:0] are written to bits [31:0] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
/// Bits [63:32] are written to bits [95:64] of the destination.
/// \param __b
-/// A 128-bit vector of [4 x float].
-/// Bits [31:0] are written to bits [63:32] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the destination. \n
/// Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2544,7 +2616,7 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
@@ -2565,7 +2637,7 @@ _mm_move_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
@@ -2586,7 +2658,7 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
@@ -2606,7 +2678,8 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
@@ -2636,7 +2709,8 @@ _mm_cvtpi16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
@@ -2665,7 +2739,8 @@ _mm_cvtpu16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
@@ -2689,7 +2764,8 @@ _mm_cvtpi8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
@@ -2713,7 +2789,8 @@ _mm_cvtpu8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
@@ -2741,12 +2818,13 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
/// packs the results into a 64-bit integer vector of [4 x i16]. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
-/// to 0x8000. Otherwise if the floating-point element is greater
-/// than 0x7FFF, it is converted to 0x7FFF.
+/// to 0x8000. Otherwise if the floating-point element is greater than
+/// 0x7FFF, it is converted to 0x7FFF.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float].
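(The saturation rules described above can be seen with a few boundary values; a sketch with hypothetical inputs:)

    #include <xmmintrin.h>

    __m128 v = _mm_set_ps(1.0e10f, -1.0e10f, 40000.0f, 100.0f);
    __m64  r = _mm_cvtps_pi16(v);
    /* element 0: 100.0f    -> 100                            */
    /* element 1: 40000.0f  -> 0x7FFF (greater than 0x7FFF)   */
    /* element 2: -1.0e10f  -> 0x8000 (less than -0x8000)     */
    /* element 3: 1.0e10f   -> 0x8000 (greater than 0x7FFFFFFF) */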
@@ -2770,12 +2848,13 @@ _mm_cvtps_pi16(__m128 __a)
/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
-/// to 0x80. Otherwise if the floating-point element is greater
-/// than 0x7F, it is converted to 0x7F.
+/// to 0x80. Otherwise if the floating-point element is greater than 0x7F,
+/// it is converted to 0x7F.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// 128-bit floating-point vector of [4 x float].
@@ -2799,7 +2878,7 @@ _mm_cvtps_pi8(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float].
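(A sketch of the sign-bit extraction performed by MOVMSKPS, with hypothetical inputs; each set sign bit contributes one bit to the low four bits of the result:)

    #include <xmmintrin.h>

    __m128 v = _mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f);  /* elements 0..3 = {4, -3, 2, -1} */
    int m = _mm_movemask_ps(v);
    /* sign bits of elements 0..3 are 0, 1, 0, 1, so m == 0xA */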