Diffstat (limited to 'contrib/arm-optimized-routines/math/aarch64/advsimd')
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c  115
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c  65
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c  78
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c  106
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c  242
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c  89
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c  135
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c  171
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c  127
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c  109
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c  75
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c  90
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c  127
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c  117
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c  47
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c  49
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c  89
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c  107
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c  87
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c  166
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c  205
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c  174
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c  120
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c  134
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c  147
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c  147
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c  128
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c  73
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c  79
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c  77
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c  82
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h  361
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c  95
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c  96
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log.c  118
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c  132
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c  106
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c  61
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c  123
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c  102
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c  88
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c  33
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c  34
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c  284
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c  209
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c  105
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c  67
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c  68
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c  44
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c  43
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c  80
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c  87
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c  67
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c  81
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c  88
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c  70
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h  58
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h  62
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h  119
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h  94
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h  104
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h  202
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h  24
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h  24
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h  64
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h  57
82 files changed, 8641 insertions, 0 deletions
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
new file mode 100644
index 000000000000..7873a07e6f56
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi, pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi = V2 (0x1.921fb54442d18p+1),
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (acos, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+ want 0x1.0d54d1985c069p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+ want 0x1.edbbedf8a7d6cp-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (Oneu - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x),
+ vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax));
+ float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
+
+ uint64x2_t is_neg = vcltzq_f64 (x);
+ float64x2_t off = vreinterpretq_f64_u64 (
+ vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
+ float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0));
+ float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off);
+
+ return vfmaq_f64 (add, mul, y);
+}
+
+TEST_SIG (V, D, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (acos), 1.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
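The change of variable documented in acos.c above can be sanity-checked against scalar libm: acos(x) = pi/2 - asin(x) for |x| < 0.5, 2*asin(sqrt((1-x)/2)) for x in [0.5, 1], and pi minus that quantity for x in [-1, -0.5]. A minimal standalone sketch (illustrative only; acos_ref is a name invented here, and libm's asin stands in for the polynomial P):

#include <math.h>
#include <stdio.h>

/* Scalar reconstruction of acos from asin with the same interval split as
   the vector routine: direct identity below 0.5, change of variable
   z = (1 - |x|)/2 at and above it.  */
static double
acos_ref (double x)
{
  const double pi = 0x1.921fb54442d18p+1;
  double ax = fabs (x);
  if (ax < 0.5)
    return 0.5 * pi - asin (x);
  double q = 2.0 * asin (sqrt ((1.0 - ax) * 0.5));
  return x > 0 ? q : pi - q;
}

int
main (void)
{
  double xs[] = { -0.99, -0.5, -0.25, 0.0, 0.25, 0.5, 0.99 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=% .2f acos=% .17g ref=% .17g\n", xs[i], acos (xs[i]),
            acos_ref (xs[i]));
  return 0;
}

Built with a C99 compiler and -lm, the two columns should agree to within a few ulps.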
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
new file mode 100644
index 000000000000..e200f792c764
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
@@ -0,0 +1,115 @@
+/*
+ * Single-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f, pif;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+ .pif = V4 (0x1.921fb6p+1f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x32800000 /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (acosf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.26 ulps,
+ _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
+
+ uint32x4_t is_neg = vcltzq_f32 (x);
+ float32x4_t off = vreinterpretq_f32_u32 (
+ vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
+ float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0));
+ float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
+
+ return vfmaq_f32 (add, mul, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (acos)
+
+TEST_SIG (V, F, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (acos), 0.82)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
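The WANT_SIMD_EXCEPT path in acosf.c folds the |x| > 1, |x| < Small and NaN checks into a single unsigned compare: subtracting Small makes tiny inputs wrap around to huge values, and anything whose absolute bits exceed One (including inf and NaN encodings) lands beyond One - Small. A scalar illustration of the same bit trick (asuint and is_special are names invented here; the constants are the ones defined in the file above):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AbsMask 0x7fffffff
#define One 0x3f800000
#define Small 0x32800000 /* asuint (0x1p-26).  */

static uint32_t
asuint (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

/* One unsigned compare flags |x| > 1, |x| < 2^-26 and NaN: subtracting
   Small wraps tiny inputs around, and anything above One (including inf
   and NaN) lands past One - Small.  */
static int
is_special (float x)
{
  uint32_t ia = asuint (x) & AbsMask;
  return (uint32_t) (ia - Small) > (uint32_t) (One - Small);
}

int
main (void)
{
  float xs[] = { 0.25f, 1.0f, 1.5f, 0x1p-30f, INFINITY, NAN };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("%-12g special=%d\n", (double) xs[i], is_special (xs[i]));
  return 0;
}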
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
new file mode 100644
index 000000000000..55d8ed5a421e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
@@ -0,0 +1,65 @@
+/*
+ * Double-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
+{
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (special)))
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
+
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
+}
+
+TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (V_NAME_D1 (acosh), 2.53)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
+TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
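The xm1/y sequence in acosh.c implements acosh(x) = log(x + sqrt(x^2 - 1)) rewritten as log1p((x - 1) + sqrt((x - 1)(x + 1))), which stays accurate as x approaches 1. A scalar sketch of the same rearrangement, checked against libm acosh (illustrative; acosh_ref is not part of the commit):

#include <math.h>
#include <stdio.h>

/* acosh(x) = log(x + sqrt(x^2 - 1))
            = log1p((x - 1) + sqrt((x - 1) * (x + 1))),
   since x + sqrt(x^2 - 1) = 1 + (x - 1) + sqrt((x - 1)(x + 1)).  */
static double
acosh_ref (double x)
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}

int
main (void)
{
  double xs[] = { 1.0, 1.0 + 0x1p-20, 2.0, 10.0, 0x1p100 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-10g acosh=%.17g ref=%.17g\n", xs[i], acosh (xs[i]),
            acosh_ref (xs[i]));
  return 0;
}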
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
new file mode 100644
index 000000000000..029d457cfa8a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#define SquareLim 0x1p64
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data *d)
+{
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
+ With exceptions disabled, we can compute u with a shorter dependency chain,
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
+
+#if WANT_SIMD_EXCEPT
+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
+#else
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
+#endif
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
+}
+
+HALF_WIDTH_ALIAS_F1 (acosh)
+
+TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
+#if WANT_SIMD_EXCEPT
+TEST_ULP (V_NAME_F1 (acosh), 2.50)
+#else
+TEST_ULP (V_NAME_F1 (acosh), 2.78)
+#endif
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
+TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
+TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
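The two u computations in acoshf.c are algebraically the same quantity x^2 - 1: the exception-safe branch uses only xm1 via 2*m + m^2 with m = x - 1, while the default branch uses the shorter (x - 1)(x + 1) dependency chain. A small check of that equivalence (illustrative only; variable names are invented here):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* u = 2m + m^2 (xm1-only, used with SIMD exceptions) and
     u = (x - 1)(x + 1) (shorter chain) are both x^2 - 1.  */
  for (float x = 1.0f; x < 16.0f; x += 1.25f)
    {
      float m = x - 1.0f;
      float u_xm1_only = fmaf (m, m, 2.0f * m);
      float u_short = m * (x + 1.0f);
      printf ("x=%-6g u_xm1_only=%.9g u_short=%.9g\n", (double) x,
              (double) u_xm1_only, (double) u_short);
    }
  return 0;
}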
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
new file mode 100644
index 000000000000..c751d9264a12
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
@@ -0,0 +1,130 @@
+/*
+ * Double-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10;
+ float64x2_t pi_over_2;
+ uint64x2_t abs_mask;
+ double c1, c3, c5, c7, c9, c11;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4,
+ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6,
+ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6,
+ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7,
+ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6,
+ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asin, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 1.01 ulps,
+ _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+ want 0x1.ed78525a927eep-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+ want 0x1.1111dd54ddf99p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate exceptions are raised. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (One - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x),
+ vfmsq_n_f64 (v_f64 (0.5), ax, 0.5));
+ float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+
+ /* order-11 estrin. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+ float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
+ float64x2_t p = vfmaq_f64 (p07, z16, p811);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (asin), 2.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
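The block of vfmaq_laneq/vfmaq calls in asin.c is an order-11 Estrin evaluation: coefficient pairs are combined with z2, then folded with z4, z8 and z16, keeping the FMA dependency chain roughly four deep instead of eleven for Horner. A generic scalar version of the same regrouping, compared against Horner (illustrative; the coefficients below are arbitrary test values, not the asin coefficients):

#include <stdio.h>

/* Horner: c0 + x*(c1 + x*(... + x*c11)).  */
static double
horner_11 (double x, const double c[12])
{
  double p = c[11];
  for (int i = 10; i >= 0; i--)
    p = c[i] + x * p;
  return p;
}

/* Estrin: pair coefficients with x, then fold with x^2, x^4 and x^8 - the
   same regrouping the vector asin uses with z2/z4/z8/z16.  */
static double
estrin_11 (double x, const double c[12])
{
  double x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
  double p01 = c[0] + x * c[1], p23 = c[2] + x * c[3];
  double p45 = c[4] + x * c[5], p67 = c[6] + x * c[7];
  double p89 = c[8] + x * c[9], p1011 = c[10] + x * c[11];
  double p03 = p01 + x2 * p23;
  double p47 = p45 + x2 * p67;
  double p811 = p89 + x2 * p1011;
  double p07 = p03 + x4 * p47;
  return p07 + x8 * p811;
}

int
main (void)
{
  /* Arbitrary test coefficients.  */
  double c[12] = { 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12 };
  for (double x = -1.0; x <= 1.0; x += 0.25)
    printf ("x=% .2f horner=% .17g estrin=% .17g\n", x, horner_11 (x, c),
            estrin_11 (x, c));
  return 0;
}

The two schemes evaluate the same polynomial, so the printed values should match up to rounding differences.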
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
new file mode 100644
index 000000000000..970feb37e1d5
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x39800000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate fp exceptions are raised. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float32x4_t y
+ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
+}
+
+HALF_WIDTH_ALIAS_F1 (asin)
+
+TEST_SIG (V, F, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (asin), 1.91)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
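For |x| >= 0.5, asinf.c relies on asin(x) = pi/2 - 2*asin(sqrt((1 - x)/2)), the same change of variable as the double-precision routine. A quick scalar check of that identity against libm (illustrative only):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  /* asin(x) = pi/2 - 2*asin(sqrt((1 - x)/2)) for x in [0.5, 1]: the change
     of variable that lets one odd polynomial cover both intervals.  */
  for (double x = 0.5; x <= 1.0; x += 0.0625)
    {
      double ref = pi_over_2 - 2.0 * asin (sqrt ((1.0 - x) * 0.5));
      printf ("x=%-8g asin=%.17g via-identity=%.17g\n", x, asin (x), ref);
    }
  return 0;
}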
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
new file mode 100644
index 000000000000..550302826bd9
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
@@ -0,0 +1,242 @@
+/*
+ * Double-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "v_math.h"
+
+const static struct data
+{
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
+} data = {
+
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V2 (0x1p-26),
+#endif
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
+{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
+ return v_call_f64 (asinh, x, y, special);
+}
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static inline float64x2_t
+log_inline (float64x2_t xm, const struct data *d)
+{
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error is 2.79 ULP, in
+ |x| >= 1:
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t ax = vabsq_f64 (x);
+
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
+#endif
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+ overflow, by setting special lanes to 1. These will be fixed later. */
+ float64x2_t option_1 = v_f64 (0);
+ if (likely (v_any_u64 (gt1)))
+ {
+#if WANT_SIMD_EXCEPT
+ float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+ float64x2_t xm = ax;
+#endif
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
+ will be fixed later, either by selecting x or falling back to the scalar
+ special-case. The largest observed error in this region is 1.47 ULPs:
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
+ float64x2_t option_2 = v_f64 (0);
+
+ if (likely (v_any_u64 (vceqzq_u64 (gt1))))
+ {
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
+#if WANT_SIMD_EXCEPT
+ option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+ }
+
+ /* Choose the right option for each lane. */
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+ if (unlikely (v_any_u64 (special)))
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (asinh), 2.29)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+ Ensures the v_sel is choosing the right option in all cases. */
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
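The two options in asinh.c mirror the structure asinh(|x|) = log(|x| + sqrt(x^2 + 1)) for |x| >= 1 and an odd polynomial in |x| otherwise, with the sign re-applied at the end. A scalar sketch of that branch structure (illustrative only; the small-|x| series here is truncated to two correction terms and is far less accurate than the order-17 polynomial in the file):

#include <math.h>
#include <stdio.h>

/* Same branch structure as the vector routine: big-|x| log form, small-|x|
   odd series (truncated for illustration), sign copied back at the end.  */
static double
asinh_sketch (double x)
{
  double ax = fabs (x);
  double y;
  if (ax >= 1.0)
    y = log (ax + sqrt (ax * ax + 1.0)); /* option 1. */
  else
    {
      double a3 = ax * ax * ax;
      y = ax - a3 / 6.0 + 3.0 * a3 * ax * ax / 40.0; /* option 2, low order. */
    }
  return copysign (y, x);
}

int
main (void)
{
  double xs[] = { -4.0, -1.0, -0x1p-4, 0x1p-4, 0.5, 1.0, 100.0 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=% .17g sketch=% .17g\n", xs[i], asinh (xs[i]),
            asinh_sketch (xs[i]));
  return 0;
}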
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
new file mode 100644
index 000000000000..6a96f6ee9f4b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .one = V4 (1),
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+ if (unlikely (v_any_u32 (special)))
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
+#endif
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
+}
+
+HALF_WIDTH_ALIAS_F1 (asinh)
+
+TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (asinh), 2.10)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
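asinhf.c uses asinh(x) = log1p(x + x^2 / (1 + sqrt(x^2 + 1))) for non-negative x, which follows from x + sqrt(x^2 + 1) = 1 + x + x^2 / (1 + sqrt(x^2 + 1)) and keeps the log1p argument well scaled near zero. A scalar check of the rearrangement against libm (illustrative; asinhf_pos_ref is invented here and only handles x >= 0):

#include <math.h>
#include <stdio.h>

static float
asinhf_pos_ref (float x)
{
  /* x + sqrt(x^2 + 1) = 1 + (x + x^2 / (1 + sqrt(x^2 + 1))), so taking logs
     gives the log1p form used by the vector routine (for x >= 0).  */
  float d = 1.0f + sqrtf (fmaf (x, x, 1.0f));
  return log1pf (x + x * x / d);
}

int
main (void)
{
  float xs[] = { 0.0f, 0x1p-10f, 0.5f, 1.0f, 8.0f, 1024.0f };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=%.9g ref=%.9g\n", (double) xs[i],
            (double) asinhf (xs[i]), (double) asinhf_pos_ref (xs[i]));
  return 0;
}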
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
new file mode 100644
index 000000000000..26d264321068
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
@@ -0,0 +1,135 @@
+/*
+ * Double-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+ float64x2_t pi_over_2;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
+#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
+
+/* Fast implementation of vector atan.
+ Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+ z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+ _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t sign = vandq_u64 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
+ v_u64 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float64x2_t az = vbslq_f64 (
+ SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
+ y = vaddq_f64 (y, shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
+ return y;
+}
+
+TEST_SIG (V, D, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (atan), 1.78)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
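The reduction in atan.c maps |x| > 1 to its reciprocal using atan(|x|) = pi/2 - atan(1/|x|), then restores the sign at the end. A scalar check of that reduction against libm atan (illustrative only; atan_reduced is not part of the commit):

#include <math.h>
#include <stdio.h>

static double
atan_reduced (double x)
{
  /* Reduce to an argument in [0, 1]: for |x| > 1 evaluate atan on the
     reciprocal and add the pi/2 shift; the result takes the sign of x.  */
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x);
  double r = ax > 1.0 ? pi_over_2 - atan (1.0 / ax) : atan (ax);
  return copysign (r, x);
}

int
main (void)
{
  double xs[] = { -100.0, -1.5, -0.5, 0.25, 1.0, 3.0, 0x1p40 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=% .17g reduced=% .17g\n", xs[i], atan (xs[i]),
            atan_reduced (xs[i]));
  return 0;
}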
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
new file mode 100644
index 000000000000..18c4b70b92f6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
@@ -0,0 +1,171 @@
+/*
+ * Double-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+ float64x2_t pi_over_2;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+ uint64x2_t zeroinfnan, minustwo;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2),
+ .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3),
+ .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4),
+ .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4),
+ .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
+ .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
+ .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
+ .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6),
+ .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+ .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
+ .c19 = 0x1.358851160a528p-16,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+ .minustwo = V2 (0xc000000000000000),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+ uint64x2_t sign_xy, uint64x2_t cmp)
+{
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+ return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i, const struct data *d)
+{
+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
+}
+
+/* Fast implementation of vector atan2.
+ Maximum observed error is 2.8 ulps:
+ _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+ got 0x1.92d628ab678ccp-1
+ want 0x1.92d628ab678cfp-1. */
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
+
+ uint64x2_t special_cases
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+ uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
+
+ /* Set up z for call to atan. */
+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, q);
+
+ /* Work out the correct shift. */
+ float64x2_t shift
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+ shift = vmulq_f64 (shift, d->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+ ret = vaddq_f64 (ret, shift);
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, sign_xy, special_cases);
+
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+
+ return ret;
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
+TEST_SIG (V, D, 2, atan2)
+// TODO tighten this once __v_atan2 is fixed
+TEST_ULP (V_NAME_D2 (atan2), 2.9)
+TEST_DISABLE_FENV (V_NAME_D2 (atan2))
+TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
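The shift/selection logic in atan2.c reads as: divide the smaller magnitude by the larger (negated when |y| > |x|), add a quadrant-dependent multiple of pi/2, then flip the sign when sign(x) != sign(y) (the XOR with sign_xy). A scalar transcription of that control flow, validated against libm atan2 (illustrative; libm atan stands in for z + z^3*P(z^2) and the sketch ignores the 0/inf/NaN special cases):

#include <math.h>
#include <stdio.h>

static double
atan2_sketch (double y, double x)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  /* z is always in [-1, 1]; negate when swapping numerator/denominator.  */
  double z = aygtax ? -ax / ay : ay / ax;
  /* shift in {0, 1, -2, -1} * pi/2 selects the quadrant.  */
  double shift = (x < 0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0);
  double ret = shift * pi_over_2 + atan (z);
  /* XOR-ing the sign bits of x and y into the result, as the vector code
     does, amounts to a sign flip when x and y have opposite signs.  */
  return signbit (x) != signbit (y) ? -ret : ret;
}

int
main (void)
{
  double pts[][2] = { { 1, 2 },  { 3, 1 },  { 1, -2 }, { 3, -1 },
                      { -1, -2 }, { -3, -1 }, { -1, 2 }, { -3, 2 } };
  for (unsigned i = 0; i < sizeof pts / sizeof pts[0]; i++)
    printf ("atan2(% g,% g): libm=% .17g sketch=% .17g\n", pts[i][0],
            pts[i][1], atan2 (pts[i][0], pts[i][1]),
            atan2_sketch (pts[i][0], pts[i][1]));
  return 0;
}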
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
new file mode 100644
index 000000000000..632014249ab0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
@@ -0,0 +1,127 @@
+/*
+ * Single-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, pi_over_2, c4, c6, c2;
+ float c1, c3, c5, c7;
+ uint32x4_t comp_const;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+ uint32x4_t sign_xy, uint32x4_t cmp)
+{
+ /* Account for the sign of y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
+ return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint32x4_t
+zeroinfnan (uint32x4_t i, const struct data *d)
+{
+ /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+ 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+ _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t iy = vreinterpretq_u32_f32 (y);
+
+ uint32x4_t special_cases
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+ uint32x4_t sign_x = vandq_u32 (ix, SignMask);
+ uint32x4_t sign_y = vandq_u32 (iy, SignMask);
+ uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t pred_xlt0 = vcltzq_f32 (x);
+ uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
+
+ /* Set up z for call to atanf. */
+ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, q);
+
+ /* Work out the correct shift. */
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
+ shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
+ shift = vmulq_f32 (shift, d->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
+
+ /* y = shift + z * P(z^2). */
+ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
+
+ if (unlikely (v_any_u32 (special_cases)))
+ {
+ return special_case (y, x, ret, sign_xy, special_cases);
+ }
+
+ /* Account for the sign of y. */
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
+}
+
+HALF_WIDTH_ALIAS_F2 (atan2)
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
+TEST_SIG (V, F, 2, atan2)
+TEST_DISABLE_FENV (V_NAME_F2 (atan2))
+TEST_ULP (V_NAME_F2 (atan2), 2.46)
+TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
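The quadrant logic in this routine is easier to follow in scalar form. The sketch below is illustrative only and is not part of the patch: libm atanf stands in for the polynomial P, and the zero/inf/nan lanes that the vector code sends to special_case are ignored.

    #include <math.h>

    /* Scalar sketch of the atan2f reduction: pick z = |y|/|x| or -|x|/|y|,
       add a multiple of pi/2 chosen from the octant, then restore the sign
       of the result from sign(x) ^ sign(y).  */
    static float
    atan2f_reduction_sketch (float y, float x)
    {
      const float pi_over_2 = 0x1.921fb6p+0f;
      float ax = fabsf (x), ay = fabsf (y);
      int swap = ay > ax;
      float z = swap ? -ax / ay : ay / ax;
      float shift = (x < 0 ? -2.0f : 0.0f) + (swap ? 1.0f : 0.0f);
      float ret = shift * pi_over_2 + atanf (z); /* atanf stands in for z + z^3 * P(z^2).  */
      float s = copysignf (1.0f, x) * copysignf (1.0f, y);
      return s < 0 ? -ret : ret;
    }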
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
new file mode 100644
index 000000000000..61927c9b261a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
@@ -0,0 +1,109 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x) ~ x + x^3 * P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+#define P(i) d->poly[i]
+
+#define TinyBound 0x30800000 /* asuint(0x1p-30). */
+#define BigBound 0x4e800000 /* asuint(0x1p30). */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanf, x, y, special);
+}
+#endif
+
+/* Fast implementation of vector atanf based on
+ atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+ using z=-1/x and shift = pi/2. Maximum observed error is 2.9 ULP:
+ _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t sign = vandq_u32 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
+ uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
+ v_u32 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
+ /* Avoid a dependency on abs(x) in the division (and comparison). */
+ float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float32x4_t az = vbslq_f32 (
+ SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use a 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z^8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ the threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t y = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
+
+ /* y = shift + z * P(z^2). */
+ y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
+
+ return y;
+}
+
+HALF_WIDTH_ALIAS_F1 (atan)
+
+TEST_SIG (V, F, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (atan), 2.5)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
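For reference, the same reduction in scalar form. This is a sketch only: libm atanf stands in for the polynomial evaluation, and the WANT_SIMD_EXCEPT fallback is omitted.

    #include <math.h>

    /* atan(x) = sign(x) * (shift + atan(z)), with z = |x| and shift = 0 on
       [0, 1], otherwise z = -1/|x| and shift = pi/2.  */
    static float
    atanf_reduction_sketch (float x)
    {
      const float pi_over_2 = 0x1.921fb6p+0f;
      float ax = fabsf (x);
      int red = ax > 1.0f;
      float z = red ? -1.0f / ax : ax;
      float shift = red ? pi_over_2 : 0.0f;
      return copysignf (shift + atanf (z), x); /* atanf stands in for z + z^3 * P(z^2).  */
    }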
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
new file mode 100644
index 000000000000..c2f9585dd29b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
@@ -0,0 +1,75 @@
+/*
+ * Double-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one;
+ uint64x2_t sign_mask;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .sign_mask = V2 (0x8000000000000000) };
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+ uint64x2_t special, const struct data *d)
+{
+ y = log1p_inline (y, &d->log1p_consts);
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+ vmulq_f64 (halfsign, y), special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+ The greatest observed error is 3.31 ULP:
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
+
+ if (unlikely (v_any_u64 (special)))
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special, d);
+#else
+ return special_case (ax, halfsign, y, special, d);
+#endif
+
+ y = log1p_inline (y, &d->log1p_consts);
+ return vmulq_f64 (y, halfsign);
+}
+
+TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)
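The identity used by both the fast path and the special-case path is easier to read in scalar form. The following is a sketch only, with no handling of |x| >= 1:

    #include <math.h>

    /* atanh(x) = 0.5 * log((1 + x) / (1 - x))
                = copysign (0.5, x) * log1p (2*|x| / (1 - |x|)), for |x| < 1.  */
    static double
    atanh_sketch (double x)
    {
      double ax = fabs (x);
      double halfsign = copysign (0.5, x);
      return halfsign * log1p (2.0 * ax / (1.0 - ax));
    }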
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
new file mode 100644
index 000000000000..313d15ca6391
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
@@ -0,0 +1,90 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
+
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
+{
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (unlikely (v_any_u32 (special)))
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
+#endif
+
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
+ if (unlikely (v_any_u32 (special)))
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
+ return vmulq_f32 (halfsign, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (atanh)
+
+TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (atanh), 2.44)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
new file mode 100644
index 000000000000..8e72e5b566fc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
@@ -0,0 +1,127 @@
+/*
+ * Double-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f64.h"
+
+const static struct data
+{
+ float64x2_t poly[4], one_third, shift;
+ int64x2_t exp_bias;
+ uint64x2_t abs_mask, tiny_bound;
+ uint32x4_t thresh;
+ double table[5];
+} data = {
+ .shift = V2 (0x1.8p52),
+ .poly = { /* Generated with fpminimax in [0.5, 1]. */
+ V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+ V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+ .exp_bias = V2 (1022),
+ .abs_mask = V2(0x7fffffffffffffff),
+ .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */
+ .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */
+ .one_third = V2(0x1.5555555555555p-2),
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
+
+#define MantissaMask v_u64 (0x000fffffffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
+{
+ return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations.
+
+ The vector version of frexp does not handle subnormals
+ correctly. As a result these need to be handled by the scalar
+ fallback, where accuracy may be worse than that of the vector code
+ path.
+
+ Greatest observed error in the normal range is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+ int64x2_t exp_bias = d->exp_bias;
+ uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+ int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
+ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+ float64x2_t one_third = d->one_third;
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+ float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+ float64x2_t a
+ = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+ a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ float64x2_t ef = vcvtq_f64_s64 (e);
+ float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+ int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+ int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+ float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+ my = vmulq_f64 (my, a);
+
+ /* Vector version of ldexp. */
+ float64x2_t y = vreinterpretq_f64_s64 (
+ vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+ y = vmulq_f64 (y, my);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+/* Worst-case ULP error assumes that the scalar fallback is GLIBC 2.40 cbrt, which
+ has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+ in the vector path is 1.79 ULP.
+ [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+ Functions in Single, Double, Double Extended, and Quadruple Precision. */
+TEST_ULP (V_NAME_D1 (cbrt), 3.17)
+TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
+TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
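The reconstruction step described in the comment can be sketched in scalar code as follows. This is illustrative only: libm frexp and cbrt replace the in-register frexp and the polynomial-plus-Newton estimate, and special cases are ignored.

    #include <math.h>

    static double
    cbrt_reconstruct_sketch (double x)
    {
      /* T[i] = 2^((i - 2) / 3), as in the table above.  */
      static const double T[5] = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1,
                                   0x1p0, 0x1.428a2f98d728bp0,
                                   0x1.965fea53d6e3dp0 };
      int e;
      double m = frexp (fabs (x), &e);  /* |x| = m * 2^e, m in [0.5, 1).  */
      double a = cbrt (m);              /* Stands in for the poly + Newton steps.  */
      int ey = (int) round (e / 3.0);   /* round(e / 3).  */
      int rem = e - 3 * ey;             /* Remainder in [-2, 2].  */
      return copysign (ldexp (a * T[rem + 2], ey), x);
    }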
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
new file mode 100644
index 000000000000..4e76feb2dd8b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
@@ -0,0 +1,117 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[4], one_third;
+ float table[5];
+} data = {
+ .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+ FPMinimax. */
+ V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+ V4 (0x1.2c74c2p-3) },
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+ return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+ return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+ table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+ int32x4_t e
+ = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+ float32x4_t one_third = d->one_third;
+ float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+ float32x4_t a
+ = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+ int32x4_t ey = vcvtq_s32_f32 (ef);
+ int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+ float32x4_t my = shifted_lookup (d->table, em3);
+ my = vmulq_f32 (my, a);
+
+ /* Vector version of ldexpf. */
+ float32x4_t y
+ = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+ y = vmulq_f32 (y, my);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+ /* Copy sign. */
+ return vbslq_f32 (SignMask, x, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (cbrt)
+
+TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cbrt), 1.15)
+TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
new file mode 100644
index 000000000000..40ba5ff31f20
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
@@ -0,0 +1,47 @@
+/*
+ * Double-precision vector sincos function - return-by-value interface.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincos_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+static float64x2x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
+{
+ return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special),
+ v_call_f64 (cos, x, y.val[1], special) };
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR float64x2x2_t
+_ZGVnN2v_cexpi (float64x2_t x)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
+TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
+TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
+#define V_CEXPI_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
+V_CEXPI_INTERVAL (0, 0x1p23, 500000)
+V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
+V_CEXPI_INTERVAL (0x1p23, inf, 10000)
+V_CEXPI_INTERVAL (-0x1p23, -inf, 10000)
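A minimal usage sketch of the return-by-value interface. The prototype is assumed to come from the library's own headers (in the library it carries the vector PCS attribute); it is declared inline here purely for illustration.

    #include <arm_neon.h>

    float64x2x2_t _ZGVnN2v_cexpi (float64x2_t);

    /* Compute sin and cos of two angles with one shared argument reduction.  */
    static void
    cexpi_usage_sketch (const double theta[2], double s[2], double c[2])
    {
      float64x2x2_t sc = _ZGVnN2v_cexpi (vld1q_f64 (theta));
      vst1q_f64 (s, sc.val[0]); /* sin lanes.  */
      vst1q_f64 (c, sc.val[1]); /* cos lanes.  */
    }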
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
new file mode 100644
index 000000000000..e55d99653a66
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
@@ -0,0 +1,49 @@
+/*
+ * Single-precision vector cexpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+static float32x4x2_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
+{
+ return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special),
+ v_call_f32 (cosf, x, y.val[1], special) };
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR float32x4x2_t
+_ZGVnN4v_cexpif (float32x4_t x)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
+TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
+TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
+#define V_CEXPIF_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
+V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
+V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
+V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
+V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
new file mode 100644
index 000000000000..9f3de4dd5c36
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
@@ -0,0 +1,92 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+ n = vsubq_f64 (n, v_f64 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+TEST_SIG (V, D, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (cos), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)
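The range reduction above can be written in scalar terms as below. This is a sketch only, valid for |x| below the 2^23 bound: a single double-precision pi replaces the three-part pi_1/pi_2/pi_3 split, and libm sin stands in for the polynomial.

    #include <math.h>

    static double
    cos_reduction_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      const double inv_pi = 0x1.45f306dc9c883p-2;
      double ax = fabs (x);
      double n0 = rint (fma (ax, inv_pi, 0.5)); /* Quotient; its parity selects the sign.  */
      double r = ax - (n0 - 0.5) * pi;          /* Reduced to [-pi/2, pi/2].  */
      double s = sin (r);                       /* Stands in for the sin polynomial.  */
      return ((long long) n0 & 1) ? -s : s;     /* cos(x) = (-1)^n0 * sin(r).  */
    }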
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
new file mode 100644
index 000000000000..d2844e44e196
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (cos)
+
+TEST_SIG (V, F, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (cos), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
new file mode 100644
index 000000000000..54407b23aa9d
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
@@ -0,0 +1,107 @@
+/*
+ * Double-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2;
+ double ln2[2];
+ float64x2_t shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t ln2 = vld1q_f64 (d->ln2);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2, 1);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+ /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so it is the
+ same as that of the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
+}
+
+TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (cosh), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
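In scalar terms the non-special path computes the following. This is a sketch, with libm exp in place of exp_inline and no overflow handling.

    #include <math.h>

    /* cosh(x) = (exp(x) + exp(-x)) / 2 = exp(|x|)/2 + 0.5/exp(|x|).  */
    static double
    cosh_sketch (double x)
    {
      double t = exp (fabs (x));
      return 0.5 * t + 0.5 / t;
    }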
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
new file mode 100644
index 000000000000..f1ed3e5161fd
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
+ .special_bound = V4 (0x42ad496c),
+#endif
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.38 ULP:
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
+ variant for all inputs if any input is a special value or above the bound
+ at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+ if (unlikely (v_any_u32 (special)))
+ return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+ /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+ input to 0, which will generate no exceptions. */
+ if (unlikely (v_any_u32 (tiny)))
+ ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
+#endif
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (tiny)))
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, half_t, half_over_t, special);
+#endif
+
+ return vaddq_f32 (half_t, half_over_t);
+}
+
+HALF_WIDTH_ALIAS_F1 (cosh)
+
+TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cosh), 1.89)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
new file mode 100644
index 000000000000..e63201a55786
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+ float64x2_t range_val;
+} data = {
+ /* Polynomial coefficients generated using the Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (arm_math_cospi, x, y, cmp);
+}
+
+/* Approximation for vector double-precision cospi(x).
+ Maximum Error 3.06 ULP:
+ _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1
+ want 0x1.fa854babfb6c1p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float64x2_t r = vabsq_f64 (x);
+ uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+ to prevent them from overflowing and throwing exceptions. */
+ r = v_zerofy_f64 (r, cmp);
+ uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
+
+#else
+ float64x2_t r = x;
+ uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+#endif
+
+ r = vsubq_f64 (r, vrndaq_f64 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
+
+ /* y = sin(r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (cospi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#endif
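The reduction used by both cospi variants can be written in scalar form as follows. This is a sketch only, for |x| small enough that the integer cast is exact; libm sin stands in for the sinpi polynomial.

    #include <math.h>

    /* With k = round(x) and f = x - k in [-1/2, 1/2]:
       cospi(x) = (-1)^k * sinpi(1/2 - |f|).  */
    static double
    cospi_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      double k = round (x);             /* Ties away, matching vrnda/vcvta.  */
      double r = 0.5 - fabs (x - k);
      double s = sin (r * pi);          /* Stands in for the sinpi polynomial.  */
      return ((long long) k & 1) ? -s : s;
    }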
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
new file mode 100644
index 000000000000..62f4b8122b2c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
@@ -0,0 +1,86 @@
+/*
+ * Single-precision vector cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+ float32x4_t range_val;
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (arm_math_cospif, x, y, cmp);
+}
+
+/* Approximation for vector single-precision cospi(x).
+ Maximum Error: 3.17 ULP:
+ _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float32x4_t r = vabsq_f32 (x);
+ uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+ to prevent them from overflowing and throwing exceptions. */
+ r = v_zerofy_f32 (r, cmp);
+ uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
+
+#else
+ float32x4_t r = x;
+ uint32x4_t cmp = vcageq_f32 (r, d->range_val);
+
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+#endif
+
+ /* r = x - rint(x). */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (cospi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (cospi), 2.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
new file mode 100644
index 000000000000..40717a660ce2
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
@@ -0,0 +1,166 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t third;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
+ float64x2_t max, shift;
+ uint64x2_t max_idx;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound, huge_bound, scale_minus_one;
+#endif
+} data = {
+ .max_idx = V2 (768),
+ .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
+ .two_over_fifteen = 0x1.1111111111111p-3,
+ .tenth = V2 (-0x1.999999999999ap-4),
+ .two_over_five = V2 (-0x1.999999999999ap-2),
+ .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+ .max = V2 (5.9921875), /* 6 - 1/128. */
+ .shift = V2 (0x1p45),
+#if WANT_SIMD_EXCEPT
+ .huge_bound = V2 (0x1p205),
+ .tiny_bound = V2 (0x1p-226),
+ .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffffffffffff
+
+struct entry
+{
+ float64x2_t erf;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
+ e.erf = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+/* Double-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3)) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+ ]
+
+ Maximum measured error: 2.29 ULP
+ V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+ want -0x1.20dd59132ebafp-8. */
+float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ float64x2_t a = vabsq_f64 (x);
+ /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
+ to return expected results. */
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| huge or tiny. */
+ uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound);
+ uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound);
+ uint64x2_t cmp = vorrq_u64 (cmp1, cmp2);
+ /* If any lanes are special, mask them with 1 for small x or 8 for large
+ values and retain a copy of a to allow the special-case handler to fix special
+ lanes later. This is only necessary if fenv exceptions are to be triggered
+ correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ {
+ a = vbslq_f64 (cmp1, v_f64 (8.0), a);
+ a = vbslq_f64 (cmp2, v_f64 (1.0), a);
+ }
+#endif
+
+ /* Set r to multiple of 1/128 nearest to |x|. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Lookup erf(r) and scale(r) in table, without shortcut for small values,
+ but with saturated indices for large values and NaNs in order to avoid
+ segfault. */
+ uint64x2_t i
+ = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
+ struct entry e = lookup (i);
+
+ float64x2_t r = vsubq_f64 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
+ float64x2_t p1 = r;
+ float64x2_t p2
+ = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
+ p4 = vfmsq_f64 (dat->tenth, r2, p4);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
+
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p34, d2, p5);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Solves the |x| = inf and NaN cases. */
+ y = vbslq_f64 (a_gt_max, v_f64 (1.0), y);
+
+ /* Copy sign. */
+ y = vbslq_f64 (v_u64 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp2)))
+ {
+ /* Neutralise huge values of x before fixing small values. */
+ x = vbslq_f64 (cmp1, v_f64 (1.0), x);
+ /* Fix tiny values that trigger spurious underflow. */
+ return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y);
+ }
+#endif
+ return y;
+}
+
+TEST_SIG (V, D, 1, erf, -6.0, 6.0)
+TEST_ULP (V_NAME_D1 (erf), 1.79)
+/* WANT_SIMD_EXCEPT blocks miss some cases. */
+TEST_DISABLE_FENV (V_NAME_D1 (erf))
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
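The series expansion in the comment, truncated to its first three terms, looks like this in scalar form. This is a sketch: the table lookup is replaced by direct evaluation of erf(r) and scale(r), and index clamping and special cases are omitted.

    #include <math.h>

    static double
    erf_expansion_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      double a = fabs (x);
      double r = round (a * 128.0) / 128.0;           /* Nearest multiple of 1/128.  */
      double d = a - r;
      double scale = 2.0 / sqrt (pi) * exp (-r * r);  /* Read from the table above.  */
      double erf_r = erf (r);                         /* Read from the same table.  */
      double poly = d - r * d * d + (2.0 * r * r - 1.0) / 3.0 * d * d * d;
      return copysign (erf_r + scale * poly, x);
    }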
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
new file mode 100644
index 000000000000..97ef09ecc113
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
@@ -0,0 +1,205 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t offset, table_scale;
+ float64x2_t max, shift;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
+#if WANT_SIMD_EXCEPT
+ float64x2_t uflow_bound;
+#endif
+} data = {
+ /* Set an offset so the range of the index used for lookup is 3487, and it
+ can be clamped using a saturated add on an offset index.
+ Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
+ .offset = V2 (0xbd3ffffffffff260),
+ .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */
+ .max = V2 (0x1.b3ep+4), /* 3487/128. */
+ .shift = V2 (0x1p45),
+ .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
+ .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
+ .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
+ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
+ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
+ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
+ .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
+ .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
+ .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
+ .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
+#endif
+};
+
+#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */
+#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
+
+struct entry
+{
+ float64x2_t erfc;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ float64x2_t e2
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
+ e.erfc = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ return v_call_f64 (erfc, x, y, cmp);
+}
+#endif
+
+/* Optimized double-precision vector erfc(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+ - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ + p6(r) d^6 + ... + p10(r) d^10
+
+ Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+ 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+ with p0 = 1, and p1(r) = -r.
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Maximum measured error: 1.71 ULP
+ V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+ want 0x1.e15fcbea3e7adp-608. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
+ /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
+ vreinterpretq_s64_f64 (dat->uflow_bound));
+ cmp = vorrq_u64 (cmp, uflow);
+ float64x2_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ the special-case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ x = v_zerofy_f64 (x, cmp);
+#endif
+
+ float64x2_t a = vabsq_f64 (x);
+ a = vminq_f64 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Clamp index to a range of 3487. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float64x2_t r = vsubq_f64 (z, shift);
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ float64x2_t p1 = r;
+ float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
+ p4 = vfmsq_f64 (dat->p40, r2, p4);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
+ /* Compute p_i using recurrence relation:
+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
+ float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+ qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+ qr9 = vld1q_f64 (dat->qr9);
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, qr9, 1);
+ /* Compute polynomial in d using pairwise Horner scheme. */
+ float64x2_t p90 = vfmaq_f64 (p9, d, p10);
+ float64x2_t p78 = vfmaq_f64 (p7, d, p8);
+ float64x2_t p56 = vfmaq_f64 (p5, d, p6);
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p78, d2, p90);
+ y = vfmaq_f64 (p56, d2, y);
+ y = vfmaq_f64 (p34, d2, y);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Offset equals 2.0 if sign, else 0.0. */
+ uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63);
+ float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, logical or and addition are equivalent here. */
+ float64x2_t fac = vreinterpretq_f64_u64 (
+ vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f64 (off, fac, y);
+}
+
+TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (V_NAME_D1 (erfc), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
new file mode 100644
index 000000000000..f420439ef8a3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
@@ -0,0 +1,174 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint32x4_t offset, table_scale;
+ float32x4_t max, shift;
+ float coeffs[4];
+ float32x4_t third, two_over_five, tenth;
+#if WANT_SIMD_EXCEPT
+ float32x4_t uflow_bound;
+#endif
+
+} data = {
+ /* Set an offset so the range of the index used for lookup is 644, and it can
+ be clamped using a saturated add. */
+ .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */
+ .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */
+ .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */
+ .shift = V4 (0x1p17f),
+ /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
+ fmas. */
+ .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .third = V4 (0x1.555556p-2f),
+ .two_over_five = V4 (-0x1.99999ap-2f),
+ .tenth = V4 (-0x1.99999ap-4f),
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V4 (0x1.2639cp+3f),
+#endif
+};
+
+#define TinyBound 0x41000000 /* 0x1p-62f << 1. */
+#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */
+#define Off 0xfffffd7b /* 0xffffffff - 644. */
+
+struct entry
+{
+ float32x4_t erfc;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float32x2_t t0
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ float32x2_t t1
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ float32x2_t t2
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ float32x2_t t3
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
+ e.erfc = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ return v_call_f32 (erfcf, x, y, cmp);
+}
+#endif
+
+/* Optimized single-precision vector erfcf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/64.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+ _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+ want 0x1.f51216p-120. */
+NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
+ /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
+ vreinterpretq_s32_f32 (dat->uflow_bound));
+ cmp = vorrq_u32 (cmp, uflow);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ a = vminq_f32 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ /* Clamp index to a range of 644. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float32x4_t r = vsubq_f32 (z, shift);
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t p1 = r;
+ float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
+ float32x4_t p3
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
+ p4 = vfmsq_f32 (dat->tenth, r2, p4);
+
+ float32x4_t y = vfmaq_f32 (p3, d, p4);
+ y = vfmaq_f32 (p2, d, y);
+ y = vfmaq_f32 (p1, d, y);
+ y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Offset equals 2.0f if sign, else 0.0f. */
+ uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31);
+ float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, logical OR and addition are equivalent here. */
+ float32x4_t fac = vreinterpretq_f32_u32 (
+ vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f32 (off, fac, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (erfc)
+
+TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_F1 (erfc), 1.14)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
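The expansion documented in the erfcf comment block can be cross-checked with a small scalar sketch. This is illustrative only: it leans on libm's erfc/exp rather than the routine's tables, and x is an arbitrary example value.

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 1.23;
  double r = round (x * 64) / 64;          /* nearest multiple of 1/64.  */
  double d = x - r;
  double scale = 2.0 / sqrt (3.141592653589793) * exp (-r * r);
  /* poly(r, d) exactly as written in the comment above.  */
  double poly = 1 - r * d + (2.0 / 3 * r * r - 1.0 / 3) * d * d
                - r * (1.0 / 3 * r * r - 0.5) * d * d * d
                + (2.0 / 15 * r * r * r * r - 2.0 / 5 * r * r + 0.1)
                      * d * d * d * d;
  printf ("%.9g %.9g\n", erfc (r) - scale * d * poly, erfc (x));
  return 0;
}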
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
new file mode 100644
index 000000000000..508bc4c2f5e2
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
@@ -0,0 +1,120 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t max, shift, third;
+#if WANT_SIMD_EXCEPT
+ float32x4_t tiny_bound, scale_minus_one;
+#endif
+} data = {
+ .max = V4 (3.9375), /* 4 - 8/128. */
+ .shift = V4 (0x1p16f),
+ .third = V4 (0x1.555556p-2f), /* 1/3. */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x1p-62f),
+ .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffff
+
+struct entry
+{
+ float32x4_t erf;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
+ e.erf = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+/* Single-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+ Values of erf(r) and scale are read from lookup tables.
+ For |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+ Maximum error: 1.93 ULP
+ _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
+ want 0x1.fd6868p-9. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. */
+ uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
+
+ /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ uint32x4_t i
+ = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift));
+ i = vminq_u32 (i, v_u32 (512));
+ struct entry e = lookup (i);
+
+ float32x4_t r = vsubq_f32 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t y = vfmaq_f32 (r, dat->third, d);
+ y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Handle |x| > 3.9375 (including inf), where erf(|x|) rounds to 1.0f. */
+ y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
+
+ /* Copy sign. */
+ y = vbslq_f32 (v_u32 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
+#endif
+ return y;
+}
+
+HALF_WIDTH_ALIAS_F1 (erf)
+
+TEST_SIG (V, F, 1, erf, -4.0, 4.0)
+TEST_ULP (V_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
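Likewise, the short expansion used by erff can be checked against libm in a few lines (illustrative only; x is an arbitrary value and libm's erf/exp stand in for the table):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 0.7654;
  double r = round (x * 128) / 128;        /* nearest multiple of 1/128.  */
  double d = x - r;
  double scale = 2.0 / sqrt (3.141592653589793) * exp (-r * r);
  printf ("%.9g %.9g\n", erf (r) + scale * d * (1.0 - r * d - d * d / 3.0),
          erf (x));
  return 0;
}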
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
new file mode 100644
index 000000000000..a928c35c9418
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
@@ -0,0 +1,134 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+ V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+ .special_bound = V2 (704.0),
+#endif
+ .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+ .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+ float64x2_t n, r, r2, s, y, z;
+ uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+ /* n = round(x/(ln2/N)). */
+ z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+ u = vreinterpretq_u64_f64 (z);
+ n = vsubq_f64 (z, data.shift);
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (r, data.ln2_lo, n);
+
+ e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (C (0), C (1), r);
+ y = vfmaq_f64 (y, C (2), r2);
+ y = vfmaq_f64 (r, y, r2);
+
+ /* s = 2^(n/N). */
+ u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+ s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
+
+TEST_SIG (V, D, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp), 1.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)
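The rounding idiom z = x*InvLn2N + Shift; n = z - Shift relies on 0x1.8p52 forcing the addition to round at integer granularity while leaving that integer in the low mantissa bits. A standalone scalar sketch of the trick, using the N/ln2 constant from the table above (valid in the default rounding mode and for |x*N/ln2| well below 2^51):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const double Shift = 0x1.8p52;
  const double InvLn2N = 0x1.71547652b82fep7; /* N/ln2, from the table.  */

  double x = 3.7;
  double z = x * InvLn2N + Shift;
  uint64_t u;
  memcpy (&u, &z, sizeof u);            /* low bits now hold the integer.  */
  double n = z - Shift;                 /* n = round (x*N/ln2) as a double.  */

  printf ("n = %g, low bits = %llu, round () = %g\n", n,
          (unsigned long long) (u & 0xfffff), round (x * InvLn2N));
  return 0;
}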
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
new file mode 100644
index 000000000000..24fdd1c7d257
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
@@ -0,0 +1,147 @@
+/*
+ * Double-precision vector 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 163840.0 /* 1280.0 * N. */
+
+const static struct data
+{
+ float64x2_t poly[4];
+ float64x2_t log10_2, log2_10_hi, log2_10_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm.
+ rel error: 0x1.5ddf8f28p-54
+ abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ]
+ maxerr: 1.14432 +0.5 ulp. */
+ .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1),
+ V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) },
+ .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log10(2). */
+ .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log10(2)/N. */
+ .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
+ .shift = V2 (0x1.8p+52),
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (ScaleBound),
+ .special_bound = V2 (SpecialBound),
+#endif
+};
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask v_u64 (N - 1)
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
+# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp10, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp10.
+ Maximum measured error is 1.64 ulp.
+ _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
+ want 0x1.f8dab6d7fed0ap+5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcageq_f64 (x, d->special_bound);
+#endif
+
+ /* n = round(x/(log10(2)/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*log10(2)/N. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, d->log2_10_hi, n);
+ r = vfmsq_f64 (r, d->log2_10_lo, n);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, IndexMask);
+
+ /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]);
+ float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]);
+ p = vfmaq_f64 (p, y, r2);
+ y = vmulq_f64 (r, p);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n, d);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp10), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#endif
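The special-case path splits 2^(n/N) into s1*s2 by adding and subtracting exponent-sized bit patterns (SpecialBias1/SpecialBias2). The underlying trick, that adding k << 52 to the bit pattern of a normal double multiplies it by 2^k, can be seen in a scalar sketch (illustrative only, and only valid while the result stays finite and normal):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
scale_by_pow2_bits (double s, int k)
{
  uint64_t u;
  memcpy (&u, &s, sizeof u);
  u += (uint64_t) (int64_t) k << 52;   /* adjust the biased exponent.  */
  memcpy (&s, &u, sizeof u);
  return s;
}

int
main (void)
{
  printf ("%a %a\n", scale_by_pow2_bits (1.75, 100), ldexp (1.75, 100));
  printf ("%a %a\n", scale_by_pow2_bits (1.75, -200), ldexp (1.75, -200));
  return 0;
}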
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
new file mode 100644
index 000000000000..eb0d5dd0d57c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
@@ -0,0 +1,147 @@
+/*
+ * Single-precision vector 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error.
+ rel error: 0x1.89dafa3p-24
+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+ maxerr: 1.85943 +0.5 ulp. */
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V4 (ScaleBound)
+#endif
+};
+
+#if WANT_SIMD_EXCEPT
+
+# define SpecialBound 38.0f /* rint(log10(2^127)). */
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */
+# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp10f, x, y, cmp);
+}
+
+#else
+
+# define SpecialBound 126.0f
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+/* Fast vector implementation of single-precision exp10.
+ Algorithm is accurate to 2.36 ULP.
+ _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
+ want 0x1.7e79cp+11. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+#if WANT_SIMD_EXCEPT
+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t cmp = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
+
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp10)
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp10), 1.86)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+#endif
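The reduction in the comment above, exp10(x) = 2^n * 10^r with n = round(x/log10(2)), is an exact identity. A scalar sketch, with libm functions standing in for the polynomial and the exponent bit manipulation:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 3.3;
  double log10_2 = 0.30102999566398119521;
  double n = round (x / log10_2);
  double r = x - n * log10_2;
  /* 10^x = 2^n * 10^r, since 10^(n*log10(2)) = 2^n.  */
  printf ("%.17g %.17g\n", pow (10.0, x), ldexp (pow (10.0, r), (int) n));
  return 0;
}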
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
new file mode 100644
index 000000000000..63448d806b82
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
@@ -0,0 +1,128 @@
+/*
+ * Double-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define BigBound 1022.0
+#define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
+
+static const struct data
+{
+ float64x2_t poly[4];
+ float64x2_t shift, scale_big_bound, scale_uoflow_bound;
+} data = {
+ /* Coefficients are computed using Remez algorithm with
+ minimisation of the absolute error. */
+ .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3),
+ V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) },
+ .shift = V2 (0x1.8p52 / N),
+ .scale_big_bound = V2 (BigBound),
+ .scale_uoflow_bound = V2 (UOFlowBound),
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+ __v_exp_data[i[1] & IndexMask] };
+}
+
+#if WANT_SIMD_EXCEPT
+
+# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
+
+/* Call scalar exp2 as a fallback. */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
+{
+ return v_call_f64 (exp2, x, y, is_special);
+}
+
+#else
+
+# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp2.
+ Maximum measured error is 1.65 ulp.
+ _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+ want 0x1.f8db0d4df721dp-1. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
+ /* Mask special lanes and retain a copy of x for passing to special-case
+ handler. */
+ float64x2_t xc = x;
+ x = v_zerofy_f64 (x, cmp);
+#else
+ cmp = vcagtq_f64 (x, d->scale_big_bound);
+#endif
+
+ /* n = round(x*N)/N, i.e. x rounded to the nearest multiple of 1/N. */
+ float64x2_t z = vaddq_f64 (d->shift, x);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n. */
+ float64x2_t r = vsubq_f64 (x, n);
+
+ /* s = 2^(n/N). */
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ u = lookup_sbits (u);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ /* y ~ exp2(r) - 1. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
+ y = vmulq_f64 (r, y);
+
+ if (unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
+ return special_case (s, y, n, d);
+#else
+ return special_case (xc, vfmaq_f64 (s, s, y), cmp);
+#endif
+ return vfmaq_f64 (s, s, y);
+}
+
+TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp2), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
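exp2 reduces x to a multiple of 1/N plus a small remainder, then reassembles 2^x from a table entry and an exponent adjustment. A scalar sketch of that decomposition with N = 128 (the table size implied by ScaleBound = 1280.0 * N), using libm's exp2/ldexp in place of __v_exp_data:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const int N = 128;                        /* 1 << V_EXP_TABLE_BITS.  */
  double x = 5.4321;
  long k = lround (x * N);
  double r = x - (double) k / N;            /* |r| <= 1/(2N).  */
  double tab = exp2 ((double) (k % N) / N); /* stands in for __v_exp_data.  */
  double s = ldexp (tab, (int) (k / N));    /* 2^(k/N).  */
  printf ("%.17g %.17g\n", exp2 (x), s * exp2 (r));
  return 0;
}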
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
new file mode 100644
index 000000000000..40f6170d3702
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
@@ -0,0 +1,122 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t scale_thresh, special_bound;
+#endif
+ float c0, c2, c4, zero;
+} data = {
+ /* maxerr: 1.962 ulp. */
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp2)
+
+TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp2), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
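The scale computation above relies on (n << 23) + 0x3f800000 being the bit pattern of 2^n for integer n in the normal range. A scalar sketch of that exponent_bias trick (valid for -126 <= n <= 127):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
pow2_bits (int32_t n)
{
  uint32_t u = ((uint32_t) n << 23) + 0x3f800000u; /* add the biased exponent. */
  float f;
  memcpy (&f, &u, sizeof f);
  return f;
}

int
main (void)
{
  for (int n = -5; n <= 5; n++)
    printf ("%d: %a %a\n", n, pow2_bits (n), ldexpf (1.0f, n));
  return 0;
}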
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
new file mode 100644
index 000000000000..1f8e89ab658f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
@@ -0,0 +1,73 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c1, c2, c3, c4, c5, shift;
+ uint32x4_t exponent_bias;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+} data = {
+ .shift = V4 (0x1.8p23f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.878 ulp. */
+ .c0 = V4 (0x1.416b5ep-13f),
+ .c1 = V4 (0x1.5f082ep-10f),
+ .c2 = V4 (0x1.3b2dep-7f),
+ .c3 = V4 (0x1.c6af7cp-5f),
+ .c4 = V4 (0x1.ebfbdcp-3f),
+ .c5 = V4 (0x1.62e43p-1f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+ /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = x - n;
+ uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+ float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (d->c5, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
+TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
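The polynomial in _ZGVnN4v_exp2f_1u is a plain Horner chain whose coefficients approximate 2^r on [-1/2, 1/2]. A scalar sketch using the coefficients from the data table above:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const float c[] = { 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f,
                      0x1.c6af7cp-5f,  0x1.ebfbdcp-3f,  0x1.62e43p-1f };
  float r = 0.25f;
  float p = c[0];
  for (int i = 1; i < 6; i++)
    p = p * r + c[i];
  p = p * r + 1.0f;                /* poly(r) ~= 2^r on [-1/2, 1/2].  */
  printf ("%a %a\n", p, exp2f (r));
  return 0;
}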
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
new file mode 100644
index 000000000000..e5b1f020d1a0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
@@ -0,0 +1,130 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ /* (s2 + p*s2)*s1 = s2*(p+1)*s1. */
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t cmp = vcgeq_u32 (
+ vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
+ TinyBound),
+ SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp)
+
+TEST_SIG (V, F, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
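The reduction r = x - n*ln2_hi - n*ln2_lo is a classic two-constant (Cody-Waite) split: ln2_hi carries only a few significand bits, so n*ln2_hi is exact, and ln2_lo mops up the remainder of ln2. A scalar sketch with the constants from the table above (the vector code uses fused multiply-subtracts; plain multiplies are enough to show the idea):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;    /* few bits: n*ln2_hi is exact.  */
  const float ln2_lo = 0x1.7f7d1cp-20f; /* remainder of ln2.  */

  float x = 20.0f;
  float n = roundf (x * inv_ln2);       /* round (x/ln2).  */
  float r = x - n * ln2_hi;
  r = r - n * ln2_lo;
  printf ("r = %a (reference %a)\n", r, (float) (x - (double) n * log (2.0)));
  return 0;
}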
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
new file mode 100644
index 000000000000..4e114d810e08
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
@@ -0,0 +1,79 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t shift, inv_ln2;
+ uint32x4_t exponent_bias;
+ float32x4_t c1, c2, c3, c4;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+ float ln2_hi, ln2_lo, c0, nothing;
+} data = {
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .shift = V4 (0x1.8p23f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x83000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.36565 +0.5 ulp. */
+ .c0 = 0x1.6a6000p-10f,
+ .c1 = V4 (0x1.12718ep-7f),
+ .c2 = V4 (0x1.555af0p-5f),
+ .c3 = V4 (0x1.555430p-3f),
+ .c4 = V4 (0x1.fffff4p-2f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t z = vmulq_f32 (x, d->inv_ln2);
+ float32x4_t n = vrndaq_f32 (z);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
+TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
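specialcase above selects between r1 and r0 with bitwise masks rather than a branch. A scalar sketch of that mask-select idiom, (mask & a) | (~mask & b), which is what the vector code does per lane with all-ones or all-zeros comparison results:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t a = 0x11111111, b = 0x22222222;
  uint32_t all = 0xffffffffu, none = 0;
  /* Prints a for an all-ones mask and b for an all-zeros mask.  */
  printf ("%x %x\n", (all & a) | (~all & b), (none & a) | (~none & b));
  return 0;
}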
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
new file mode 100644
index 000000000000..7535a1830427
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
@@ -0,0 +1,77 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t thresh, tiny_bound;
+#else
+ float64x2_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+#if WANT_SIMD_EXCEPT
+ /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+ compare. */
+ .thresh = V2 (0x78c56fa6d34b552),
+ /* asuint64(0x1p-51) << 1. */
+ .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+ /* Value above which expm1(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+ want 0x1.a8897eef87b32p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
+#else
+ /* Large input, NaNs and Infs. */
+ uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (expm1), 1.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
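The WANT_SIMD_EXCEPT path uses a shifted absolute compare: doubling the bit pattern discards the sign, so a single unsigned subtract-and-compare tests whether |x| lies outside [2^-51, oflow_bound), catching NaN and Inf as well. A scalar sketch using the thresh constant from the table above:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
is_special (double x)
{
  const uint64_t tiny = 0x3cc0000000000000ull << 1; /* asuint64(0x1p-51)<<1. */
  const uint64_t thresh = 0x78c56fa6d34b552ull;     /* from the table above. */
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  return (ix + ix) - tiny >= thresh;  /* unsigned wraparound does the work.  */
}

int
main (void)
{
  printf ("%d %d %d\n", is_special (0x1p-60), is_special (1.0),
          is_special (1e300));        /* expect 1 0 1.  */
  return 0;
}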
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
new file mode 100644
index 000000000000..6d4431dcd8a5
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data d;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1F_DATA,
+#if !WANT_SIMD_EXCEPT
+ /* Value above which expm1f(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+ /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+ compare. */
+ .thresh = V4 (0x1d5ebc40),
+#endif
+};
+
+/* asuint(0x1p-23), shifted by 1 for abs compare. */
+#define TinyBound v_u32 (0x34000000 << 1)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+}
+
+/* Single-precision vector exp(x) - 1 function.
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
+#else
+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1f_inline (x, &d->d);
+}
+
+HALF_WIDTH_ALIAS_F1 (expm1)
+
+TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (expm1), 1.13)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
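Both expm1 variants end with the comment expm1(x) ~= p * t + (t - 1). Assuming, as in the usual reduction, t = 2^k and p = expm1(r) with x = k*ln2 + r (the shared inline helpers are not part of this diff, so that reading is an assumption), the identity can be checked directly:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 3.1;
  double k = round (x / log (2.0));
  double r = x - k * log (2.0);
  double t = ldexp (1.0, (int) k);   /* t = 2^k (assumed meaning of t).  */
  double p = expm1 (r);              /* p = expm1(r) (assumed meaning of p). */
  printf ("%.17g %.17g\n", expm1 (x), p * t + (t - 1.0));
  return 0;
}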
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
new file mode 100644
index 000000000000..0c8350a1a77b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
@@ -0,0 +1,361 @@
+/*
+ * Double-precision x^y function.
+ *
+ * Copyright (c) 2018-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Scalar version of pow used for fallbacks in vector implementations. */
+
+/* Data is defined in v_pow_log_data.c. */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define Off 0x3fe6955500000000
+#define As __v_pow_log_data.poly
+
+/* Data is defined in v_pow_exp_data.c. */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define SmallExp 0x3c9 /* top12(0x1p-54). */
+#define BigExp 0x408 /* top12(512.0). */
+#define ThresExp 0x03f /* BigExp - SmallExp. */
+#define InvLn2N __v_pow_exp_data.n_over_ln2
+#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi
+#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo
+#define SBits __v_pow_exp_data.sbits
+#define Cs __v_pow_exp_data.poly
+
+/* Constants associated with pow. */
+#define SmallPowX 0x001 /* top12(0x1p-126). */
+#define BigPowX 0x7ff /* top12(INFINITY). */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
+#define BigPowY 0x43e /* top12(0x1.749p62). */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+
+/* Top 12 bits of a double (sign and exponent bits). */
+static inline uint32_t
+top12 (double x)
+{
+ return asuint64 (x) >> 52;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline double
+log_inline (uint64_t ix, double *tail)
+{
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64_t tmp = ix - Off;
+ int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+ int k = (int64_t) tmp >> 52; /* arithmetic shift. */
+ uint64_t iz = ix - (tmp & 0xfffULL << 52);
+ double z = asdouble (iz);
+ double kd = (double) k;
+
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ double invc = __v_pow_log_data.invc[i];
+ double logc = __v_pow_log_data.logc[i];
+ double logctail = __v_pow_log_data.logctail[i];
+
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ double r = fma (z, invc, -1.0);
+
+ /* k*Ln2 + log(c) + r. */
+ double t1 = kd * __v_pow_log_data.ln2_hi + logc;
+ double t2 = t1 + r;
+ double lo1 = kd * __v_pow_log_data.ln2_lo + logctail;
+ double lo2 = t1 - t2 + r;
+
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double ar = As[0] * r;
+ double ar2 = r * ar;
+ double ar3 = r * ar2;
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ double hi = t2 + ar2;
+ double lo3 = fma (ar, r, -ar2);
+ double lo4 = t2 - hi + ar2;
+ /* p = log1p(r) - r - A[0]*r*r. */
+ double p = (ar3
+ * (As[1] + r * As[2]
+ + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6]))));
+ double lo = lo1 + lo2 + lo3 + lo4 + p;
+ double y = hi + lo;
+ *tail = hi - y + lo;
+ return y;
+}
+
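log_inline follows the decomposition in its comment: x = 2^k * z with a pivot c near z, so log(x) = k*ln2 + log(c) + log1p(z/c - 1). A scalar sketch of that identity, with frexp standing in for the bit-level normalisation and an arbitrary pivot instead of the table lookup:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 123.456;
  int k;
  double z = frexp (x, &k);    /* x = 2^k * z, z in [0.5, 1).  */
  double c = 0.75;             /* arbitrary pivot; the real code picks one
                                  close to z so log1p sees a tiny argument.  */
  double r = z / c - 1.0;
  printf ("%.17g %.17g\n", k * log (2.0) + log (c) + log1p (r), log (x));
  return 0;
}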
+/* Handle cases that may overflow or underflow when computing the result that
+ is scale*(1+TMP) without intermediate rounding. The bit representation of
+ scale is in SBITS, but its computed exponent may have overflowed into the
+ sign bit, so it must be adjusted before being used as a double.
+ (int32_t)KI is the k used in the argument reduction and exponent adjustment
+ of scale; positive k here means the result may overflow and negative k
+ means the result may underflow. */
+static inline double
+special_case (double tmp, uint64_t sbits, uint64_t ki)
+{
+ double scale, y;
+
+ if ((ki & 0x80000000) == 0)
+ {
+ /* k > 0, the exponent of scale might have overflowed by <= 460. */
+ sbits -= 1009ull << 52;
+ scale = asdouble (sbits);
+ y = 0x1p1009 * (scale + scale * tmp);
+ return y;
+ }
+ /* k < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ /* Note: sbits is signed scale. */
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+#if WANT_SIMD_EXCEPT
+ if (fabs (y) < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double hi, lo, one = 1.0;
+ if (y < 0.0)
+ one = -1.0;
+ lo = scale - y + scale * tmp;
+ hi = one + y;
+ lo = one - hi + y + lo;
+ y = (hi + lo) - one;
+ /* Fix the sign of 0. */
+ if (y == 0.0)
+ y = asdouble (sbits & 0x8000000000000000);
+ /* The underflow exception needs to be signaled explicitly. */
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ }
+#endif
+ y = 0x1p-1022 * y;
+ return y;
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
+static inline double
+exp_inline (double x, double xtail, uint32_t sign_bias)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ if (abstop - SmallExp >= 0x80000000)
+ {
+ /* Avoid spurious underflow for tiny x. */
+ /* Note: 0 is common input. */
+ return sign_bias ? -1.0 : 1.0;
+ }
+ if (abstop >= top12 (1024.0))
+ {
+ /* Note: inf and nan are already handled. */
+ /* Skip errno handling. */
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (sign_bias)
+ : __math_oflow (sign_bias);
+#else
+ double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;
+ return sign_bias ? -res_uoflow : res_uoflow;
+#endif
+ }
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return scale + scale * tmp;
+}
+
+/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ A version of exp_inline that is not inlined and for which sign_bias is
+ equal to 0. */
+static double NOINLINE
+exp_nosignbias (double x, double xtail)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ /* Avoid spurious underflow for tiny x. */
+ if (abstop - SmallExp >= 0x80000000)
+ return 1.0;
+ /* Note: inf and nan are already handled. */
+ if (abstop >= top12 (1024.0))
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);
+#else
+ return asuint64 (x) >> 63 ? 0.0 : INFINITY;
+#endif
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return scale + scale * tmp;
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
+static inline int
+checkint (uint64_t iy)
+{
+ int e = iy >> 52 & 0x7ff;
+ if (e < 0x3ff)
+ return 0;
+ if (e > 0x3ff + 52)
+ return 2;
+ if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+ return 0;
+ if (iy & (1ULL << (0x3ff + 52 - e)))
+ return 1;
+ return 2;
+}
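To make the exponent arithmetic above concrete: the biased exponent e says how many mantissa bits lie below the unit place, so any set bit in that range means a fractional value, and the bit exactly at the unit place gives the parity. A standalone restatement; the helper name classify_int and the memcpy bit-cast are illustrative, not from the source:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical standalone version of the classification above:
   0 = not an integer, 1 = odd integer, 2 = even integer.  */
static int
classify_int (double y)
{
  uint64_t iy;
  memcpy (&iy, &y, sizeof iy);
  int e = (int) (iy >> 52 & 0x7ff);
  if (e < 0x3ff)
    return 0; /* |y| < 1; a non-zero finite y cannot be an integer.  */
  if (e > 0x3ff + 52)
    return 2; /* No fractional bits left; such values are even.  */
  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
    return 0; /* Bits below the unit place: fractional value.  */
  return (iy & (1ULL << (0x3ff + 52 - e))) ? 1 : 2;
}

int
main (void)
{
  printf ("%d %d %d\n", classify_int (3.0), classify_int (4.0),
          classify_int (2.5)); /* Expect: 1 2 0.  */
  return 0;
}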
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline int
+zeroinfnan (uint64_t i)
+{
+ return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
+}
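The single comparison works because doubling the bit pattern discards the sign bit, and subtracting 1 makes the pattern of +-0 wrap around to the largest unsigned value, so zeros, infinities and NaNs all land at or above 2*asuint64(inf) - 1. A small standalone check; the helper name and the memcpy bit-cast are illustrative:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
is_zero_inf_nan (double x) /* Same test as above, restated standalone.  */
{
  uint64_t i, inf_bits;
  double inf = INFINITY;
  memcpy (&i, &x, sizeof i);
  memcpy (&inf_bits, &inf, sizeof inf_bits);
  return 2 * i - 1 >= 2 * inf_bits - 1;
}

int
main (void)
{
  printf ("%d %d %d %d %d\n", is_zero_inf_nan (0.0), is_zero_inf_nan (-0.0),
          is_zero_inf_nan (1.0), is_zero_inf_nan (INFINITY),
          is_zero_inf_nan (NAN)); /* Expect: 1 1 0 1 1.  */
  return 0;
}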
+
+static double NOINLINE
+pow_scalar_special_case (double x, double y)
+{
+ uint32_t sign_bias = 0;
+ uint64_t ix, iy;
+ uint32_t topx, topy;
+
+ ix = asuint64 (x);
+ iy = asuint64 (y);
+ topx = top12 (x);
+ topy = top12 (y);
+ if (unlikely (topx - SmallPowX >= ThresPowX
+ || (topy & 0x7ff) - SmallPowY >= ThresPowY))
+ {
+ /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0
+ and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */
+ /* Special cases: (x < 0x1p-126 or inf or nan) or
+ (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */
+ if (unlikely (zeroinfnan (iy)))
+ {
+ if (2 * iy == 0)
+ return issignaling_inline (x) ? x + y : 1.0;
+ if (ix == asuint64 (1.0))
+ return issignaling_inline (y) ? x + y : 1.0;
+ if (2 * ix > 2 * asuint64 (INFINITY)
+ || 2 * iy > 2 * asuint64 (INFINITY))
+ return x + y;
+ if (2 * ix == 2 * asuint64 (1.0))
+ return 1.0;
+ if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
+ return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
+ return y * y;
+ }
+ if (unlikely (zeroinfnan (ix)))
+ {
+ double x2 = x * x;
+ if (ix >> 63 && checkint (iy) == 1)
+ {
+ x2 = -x2;
+ sign_bias = 1;
+ }
+#if WANT_SIMD_EXCEPT
+ if (2 * ix == 0 && iy >> 63)
+ return __math_divzero (sign_bias);
+#endif
+ return iy >> 63 ? 1 / x2 : x2;
+ }
+ /* Here x and y are non-zero finite. */
+ if (ix >> 63)
+ {
+ /* Finite x < 0. */
+ int yint = checkint (iy);
+ if (yint == 0)
+#if WANT_SIMD_EXCEPT
+ return __math_invalid (x);
+#else
+ return __builtin_nan ("");
+#endif
+ if (yint == 1)
+ sign_bias = SignBias;
+ ix &= 0x7fffffffffffffff;
+ topx &= 0x7ff;
+ }
+ if ((topy & 0x7ff) - SmallPowY >= ThresPowY)
+ {
+ /* Note: sign_bias == 0 here because y is not odd. */
+ if (ix == asuint64 (1.0))
+ return 1.0;
+ /* |y| < 2^-65, x^y ~= 1 + y*log(x). */
+ if ((topy & 0x7ff) < SmallPowY)
+ return 1.0;
+#if WANT_SIMD_EXCEPT
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)
+ : __math_uflow (0);
+#else
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;
+#endif
+ }
+ if (topx == 0)
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ ix = asuint64 (x * 0x1p52);
+ ix &= 0x7fffffffffffffff;
+ ix -= 52ULL << 52;
+ }
+ }
+
+ double lo;
+ double hi = log_inline (ix, &lo);
+ double ehi = y * hi;
+ double elo = y * lo + fma (y, hi, -ehi);
+ return exp_inline (ehi, elo, sign_bias);
+}
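The last three lines split y*log(x) into a rounded product ehi plus a correction elo, where fma (y, hi, -ehi) recovers the exact rounding error of y*hi and y*lo adds the low part of the logarithm, so exp_inline sees the product to extra precision. A minimal sketch of that splitting with made-up hi/lo values; only fma from <math.h> is assumed:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* ehi is the rounded product y*hi; elo collects the exact rounding error
     of that product (recovered with fma) plus the y*lo term, i.e. everything
     a single multiply would discard.  The hi/lo values are made up.  */
  double y = 0x1.8p-13;
  double hi = -93.7, lo = 1.3e-15;
  double ehi = y * hi;
  double elo = y * lo + fma (y, hi, -ehi);
  printf ("ehi = %a\nelo = %a\n", ehi, elo);
  return 0;
}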
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
new file mode 100644
index 000000000000..dc01ed5bac93
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
@@ -0,0 +1,95 @@
+/*
+ * Double-precision vector hypot(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint64x2_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint64x2_t tiny_bound;
+ uint32x4_t thres;
+} data = {
+ .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+ uint32x2_t special)
+{
+ return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+ got 0x1.6a1b19400964ep-204
+ want 0x1.6a1b19400964dp-204. */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+ uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+ uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f64 (ax, specialx);
+ ay = v_zerofy_f64 (ay, specialy);
+ uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#endif
+
+TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_D2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
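For reference, the exported symbol quoted in the error report above (_ZGVnN2vv_hypot) can be called directly with NEON vectors. A minimal usage sketch, assuming the library is built with the vector PCS and linked in; the hand-written prototype below is illustrative, not taken from a header:

#include <arm_neon.h>
#include <stdio.h>

/* Prototype written by hand for illustration; the symbol name matches the
   one quoted in the worst-case error report above.  */
__attribute__ ((aarch64_vector_pcs)) float64x2_t
_ZGVnN2vv_hypot (float64x2_t x, float64x2_t y);

int
main (void)
{
  float64x2_t x = { 3.0, 5.0 };
  float64x2_t y = { 4.0, 12.0 };
  float64x2_t r = _ZGVnN2vv_hypot (x, y);
  printf ("%g %g\n", vgetq_lane_f64 (r, 0), vgetq_lane_f64 (r, 1)); /* 5 13 */
  return 0;
}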
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
new file mode 100644
index 000000000000..69634875be5a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
@@ -0,0 +1,96 @@
+/*
+ * Single-precision vector hypot(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint32x4_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint32x4_t tiny_bound;
+ uint16x8_t thres;
+} data = {
+ .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+ uint16x4_t special)
+{
+ return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+ want 0x1.6a41dp-13. */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+ uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+ uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f32 (ax, specialx);
+ ay = v_zerofy_f32 (ay, specialy);
+ uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+ uint16x4_t special
+ = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#endif
+
+HALF_WIDTH_ALIAS_F2 (hypot)
+
+TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_F2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
new file mode 100644
index 000000000000..94e3f4482079
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
@@ -0,0 +1,118 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, ln2, c4;
+} data = {
+ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .c0 = V2 (-0x1.ffffffffffff7p-2),
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = V2 (-0x1.0000000399c27p-2),
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ .off = V2 (0x3fe6900900000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log routine.
+ The maximum observed error is 2.17 ULP:
+ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+ want 0x1.ffffff1cca045p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
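The reduction comment above ("x = 2^k z; where z is in range [Off,2*Off) and exact") relies on doing the split in the integer domain: after subtracting Off, an arithmetic shift by 52 yields k, and masking off the shifted exponent bits leaves z. A scalar sketch of the same split; Off matches the .off constant above, and the variable names are illustrative:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const uint64_t Off = 0x3fe6900900000000;
  double x = 123.456;
  uint64_t u, iz;
  memcpy (&u, &x, sizeof u);
  uint64_t u_off = u - Off;
  int64_t k = (int64_t) u_off >> 52; /* Arithmetic shift gives the scale k.  */
  iz = u - (u_off & 0xfff0000000000000ULL); /* Remove k from the exponent.  */
  double z;
  memcpy (&z, &iz, sizeof z);
  printf ("k = %lld  z = %.17g  ldexp(z,k) = %.17g\n", (long long) k, z,
          ldexp (z, (int) k)); /* ldexp(z,k) reproduces x exactly.  */
  return 0;
}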
+
+TEST_SIG (V, D, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log), 1.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
+TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
new file mode 100644
index 000000000000..c2b8f1c54f0e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
@@ -0,0 +1,132 @@
+/*
+ * Double-precision vector log10(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ double invln10, log10_2;
+ double c1, c3;
+ float64x2_t c0, c2, c4;
+} data = {
+ /* Computed from log coefficients divided by log(10) then rounded to double
+ precision. */
+ .c0 = V2 (-0x1.bcb7b1526e506p-3),
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = V2 (-0x1.bcb7b158af938p-4),
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = V2 (-0x1.287461742fee4p-4),
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
+};
+
+#define N (1 << V_LOG10_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log10c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log10c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Fast implementation of double-precision vector log10
+ is a slight modification of double-precision vector log.
+ Max ULP error: < 2.5 ulp (nearest rounding.)
+ Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+ _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+ want 0x1.fff6be3cae4b9p-6. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r / log(10) + log10(c) + k*log10(2).
+ Constants in v_log10_data.c are computed (in extended precision) as
+ e.log10c := e.logc * invln10. */
+ float64x2_t cte = vld1q_f64 (&d->invln10);
+ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
+
+ /* y = log10(1+r) + n * log10(2). */
+ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_f64 (y, d->c4, r2);
+ y = vfmaq_f64 (p, y, r2);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log10), 1.97)
+TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
new file mode 100644
index 000000000000..907c1051e086
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+ .c0 = V4 (-0x1.bcb79cp-3f),
+ .c1 = 0x1.2879c8p-3f,
+ .c2 = V4 (-0x1.bcd472p-4f),
+ .c3 = 0x1.6408f8p-4f,
+ .c4 = V4 (-0x1.246f8p-4f),
+ .c5 = 0x1.f0e514p-5f,
+ .c6 = V4 (-0x1.0fc92cp-4f),
+ .c7 = 0x1.f5f76ap-5f,
+ .ln2 = V4 (0x1.62e43p-1f),
+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+}
+
+/* Fast implementation of AdvSIMD log10f.
+ Uses a similar approach to AdvSIMD logf, with the same offset (i.e., 2/3) and
+ an order 9 polynomial.
+ Maximum error: 3.305 ulps (nearest rounding).
+ _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log10(1+r) + n * log10(2). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
+ /* y = Log10(2) * n + poly * InvLn(10). */
+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+ y = vmulq_f32 (y, d->inv_ln10);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (y, u_off, poly, r2, special, d);
+ return vfmaq_f32 (y, poly, r2);
+}
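The c1357 vector above packs four scalar coefficients into one register so each odd term is applied with a lane-indexed FMA, avoiding a separate constant load per coefficient. A minimal sketch of the same idiom on a toy cubic; the coefficients and inputs here are made up, not the ones above:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* Evaluate p(r) = k0 + k1*r + k2*r^2 + k3*r^3 for four lanes at once,
     keeping the odd/even split and lane-indexed FMAs used above.  */
  const float k[4] = { 1.0f, 0.5f, 0.25f, 0.125f }; /* Toy coefficients.  */
  float32x4_t coeffs = vld1q_f32 (k);
  float32x4_t r = { 0.1f, 0.2f, 0.3f, 0.4f };
  float32x4_t r2 = vmulq_f32 (r, r);

  /* c01 = k0 + k1*r, c23 = k2 + k3*r, p = c01 + r2*c23.  */
  float32x4_t c01 = vfmaq_laneq_f32 (vdupq_n_f32 (k[0]), r, coeffs, 1);
  float32x4_t c23 = vfmaq_laneq_f32 (vdupq_n_f32 (k[2]), r, coeffs, 3);
  float32x4_t p = vfmaq_f32 (c01, r2, c23);

  float res[4];
  vst1q_f32 (res, p);
  printf ("%f %f %f %f\n", res[0], res[1], res[2], res[3]);
  return 0;
}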
+
+HALF_WIDTH_ALIAS_F1 (log10)
+
+TEST_SIG (V, F, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log10), 2.81)
+TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
new file mode 100644
index 000000000000..42a0c5793920
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
@@ -0,0 +1,61 @@
+/*
+ * Double-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data d;
+ uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000) };
+
+#define BottomMask v_u64 (0xffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+ a modification of the algorithm used in scalar log1p, with no shortcut for
+ k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+ _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+ want 0x1.fd61d0727429fp+2 . */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ uint64x2_t special_cases
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1p_inline (x, &d->d);
+}
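The special-case handler above defuses the flagged lanes before the shared approximation runs, then lets the scalar routine repair exactly those lanes. A sketch of that shape with plain intrinsics, under the assumption that v_zerofy_f64 clears flagged lanes and v_call_f64 substitutes scalar results for them (both are library helpers whose definitions are not shown here):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  float64x2_t x = { 0.5, -2.0 }; /* Lane 1 is out of log1p's domain.  */
  uint64x2_t special = vcleq_f64 (x, vdupq_n_f64 (-1.0));

  /* Defuse flagged lanes so the fast path cannot raise spurious
     exceptions (here they become +0).  */
  float64x2_t x_nospecial = vreinterpretq_f64_u64 (
      vbicq_u64 (vreinterpretq_u64_f64 (x), special));

  /* Stand-in for the vector approximation: scalar log1p per lane.  */
  double fast_lanes[2] = { log1p (vgetq_lane_f64 (x_nospecial, 0)),
                           log1p (vgetq_lane_f64 (x_nospecial, 1)) };

  /* Scalar fixup, merged back into the flagged lanes only.  */
  double fix_lanes[2] = { log1p (vgetq_lane_f64 (x, 0)),
                          log1p (vgetq_lane_f64 (x, 1)) };
  float64x2_t merged = vbslq_f64 (special, vld1q_f64 (fix_lanes),
                                  vld1q_f64 (fast_lanes));

  printf ("%g %g\n", vgetq_lane_f64 (merged, 0), vgetq_lane_f64 (merged, 1));
  return 0;
}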
+
+TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_D1 (log1p), 1.95)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
new file mode 100644
index 000000000000..94b90249128f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
+
+const static struct data
+{
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
+} data = {
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
+ .minus_one = V4 (0xbf800000),
+};
+
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ uint32x4_t special_cases
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
+ vcgeq_u32 (ix, d->minus_one));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1pf_inline (x, &d->d);
+}
+
+#else
+
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
+
+ return log1pf_inline (x, ptr_barrier (&data));
+}
+
+#endif
+
+HALF_WIDTH_ALIAS_F1 (log1p)
+
+TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_F1 (log1p), 1.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
new file mode 100644
index 000000000000..7d2e44dad2c9
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
@@ -0,0 +1,123 @@
+/*
+ * Double-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, invln2, c4;
+} data = {
+ /* Each coefficient was generated to approximate log(1+r) for |r| < 0x1.fp-9
+ and N = 128, then scaled by log2(e) in extended precision and rounded back
+ to double precision. */
+ .c0 = V2 (-0x1.71547652b8300p-1),
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = V2 (-0x1.71547651c8f35p-2),
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG2_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log2c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log2c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log2 routine. Implements the same algorithm as
+ vector log10, with coefficients and table entries scaled in extended
+ precision. The maximum observed error is 2.58 ULP:
+ _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
+ float64x2_t hi
+ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
+
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log2), 2.09)
+TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
new file mode 100644
index 000000000000..3053c64bc552
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
@@ -0,0 +1,102 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, c8;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Coefficients generated using Remez algorithm approximate
+ log2(1+r)/r for r in [ -1/3, 1/3 ].
+ rel error: 0x1.c4c4b0cp-26. */
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ .c1 = -0x1.715458p-1f,
+ .c2 = V4 (0x1.ec701cp-2f),
+ .c3 = -0x1.7171a4p-2f,
+ .c4 = V4 (0x1.27a0b8p-2f),
+ .c5 = -0x1.e5143ep-3f,
+ .c6 = V4 (0x1.9d8ecap-3f),
+ .c7 = -0x1.c675bp-3f,
+ .c8 = V4 (0x1.9e495p-3f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+}
+
+/* Fast implementation for single-precision AdvSIMD log2.
+ Relies on the same argument reduction as AdvSIMD logf.
+ Maximum error: 2.48 ULPs
+ _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+ want 0x1.a9be8p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log2(1+r) + n. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (n, u_off, p, r, special, d);
+ return vfmaq_f32 (n, p, r);
+}
+
+HALF_WIDTH_ALIAS_F1 (log2)
+
+TEST_SIG (V, F, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log2), 1.99)
+TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
new file mode 100644
index 000000000000..84705fad05ee
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c2, c4, c6, ln2;
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
+ uint16x8_t special_bound;
+ float c1, c3, c5, c0;
+} data = {
+ /* 3.34 ulp error. */
+ .c0 = -0x1.3e737cp-3f,
+ .c1 = 0x1.5a9aa2p-3f,
+ .c2 = V4 (-0x1.4f9934p-3f),
+ .c3 = 0x1.961348p-3f,
+ .c4 = V4 (-0x1.00187cp-2f),
+ .c5 = 0x1.555d7cp-2f,
+ .c6 = V4 (-0x1.ffffc8p-2f),
+ .ln2 = V4 (0x1.62e43p-1f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff)
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
+ q = vfmaq_f32 (q, p, r2);
+ y = vfmaq_f32 (y, q, r2);
+ p = vfmaq_f32 (r, d->ln2, n);
+
+ if (unlikely (v_any_u16h (cmp)))
+ return special_case (p, u_off, y, r2, cmp, d);
+ return vfmaq_f32 (p, y, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log)
+
+TEST_SIG (V, F, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log), 2.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
+TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
+TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
new file mode 100644
index 000000000000..da2fcbff8514
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
@@ -0,0 +1,33 @@
+/*
+ * Double-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modf algorithm. Produces exact values in all rounding modes. */
+float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
+{
+ /* Get integer component of x. */
+ float64x2_t rounded = vrndq_f64 (x);
+ vst1q_f64 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint64x2_t is_integer = vceqq_f64 (x, rounded);
+ return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer));
+}
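The vbic at the end matters because an exact x - x is -0 when rounding toward negative infinity, so for integral inputs the subtraction alone could leave a stray sign bit; clearing the flagged lanes guarantees the +0 promised in the comment. A small scalar illustration; it needs <fenv.h> and, strictly speaking, FENV_ACCESS or -frounding-math:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  volatile double x = 7.0; /* volatile keeps the subtraction at run time.  */
  fesetround (FE_DOWNWARD);
  double frac = x - rint (x); /* Exact zero, rounded toward -inf: -0.  */
  fesetround (FE_TONEAREST);
  printf ("signbit(x - rint(x)) under FE_DOWNWARD: %d\n", signbit (frac) != 0);
  return 0;
}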
+
+TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
new file mode 100644
index 000000000000..0a646b24cb1a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
@@ -0,0 +1,34 @@
+/*
+ * Single-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modff algorithm. Produces exact values in all rounding modes. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
+ float *out_int)
+{
+ /* Get integer component of x. */
+ float32x4_t rounded = vrndq_f32 (x);
+ vst1q_f32 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint32x4_t is_integer = vceqq_f32 (x, rounded);
+ return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer));
+}
+
+TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
new file mode 100644
index 000000000000..db9d6e9ba14b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
@@ -0,0 +1,284 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Defines parameters of the approximation and scalar fallback. */
+#include "finite_pow.h"
+
+#define VecSmallPowX v_u64 (SmallPowX)
+#define VecThresPowX v_u64 (ThresPowX)
+#define VecSmallPowY v_u64 (SmallPowY)
+#define VecThresPowY v_u64 (ThresPowY)
+
+static const struct data
+{
+ uint64x2_t inf;
+ float64x2_t small_powx;
+ uint64x2_t offset, mask;
+ uint64x2_t mask_sub_0, mask_sub_1;
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
+ double log_c1, log_c3;
+ double ln2_lo, ln2_hi;
+ uint64x2_t small_exp, thres_exp;
+ double ln2_lo_n, ln2_hi_n;
+ double inv_ln2_n, exp_c2;
+ float64x2_t exp_c0, exp_c1;
+} data = {
+ /* Power threshold. */
+ .inf = V2 (0x7ff0000000000000),
+ .small_powx = V2 (0x1p-126),
+ .offset = V2 (Off),
+ .mask = V2 (0xfffULL << 52),
+ .mask_sub_0 = V2 (1ULL << 52),
+ .mask_sub_1 = V2 (52ULL << 52),
+ /* Coefficients copied from v_pow_log_data.c
+ relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
+ Coefficients are scaled to match the scaling during evaluation. */
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
+ .log_c1 = -0x1.0000000000006p-2 * -2,
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
+ (0.550 without fma) if |x| < ln2/512. */
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .small_exp = V2 (0x3c90000000000000),
+ .thres_exp = V2 (0x03f0000000000000),
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
+};
+
+/* This version implements an algorithm close to scalar pow but
+ - does not implement the trick in the exp's specialcase subroutine to avoid
+ double-rounding,
+ - does not use a tail in the exponential core computation,
+ - and pow's exp polynomial order and table bits might differ.
+
+ Maximum measured error is 1.04 ULPs:
+ _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13)
+ got 0x1.f71162f473251p-1
+ want 0x1.f71162f473252p-1. */
+
+static inline float64x2_t
+v_masked_lookup_f64 (const double *table, uint64x2_t i)
+{
+ return (float64x2_t){
+ table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)],
+ table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)]
+ };
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline float64x2_t
+v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
+{
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp);
+ float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp);
+ float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp);
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
+ /* k*Ln2 + log(c) + r. */
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
+ float64x2_t t2 = vaddq_f64 (t1, r);
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
+ float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
+ float64x2_t ar2 = vmulq_f64 (r, ar);
+ float64x2_t ar3 = vmulq_f64 (r, ar2);
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ float64x2_t hi = vaddq_f64 (t2, ar2);
+ float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
+ float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
+ /* p = log1p(r) - r - A[0]*r*r. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
+ float64x2_t p = vfmaq_f64 (a34, ar2, a56);
+ p = vfmaq_f64 (a12, ar2, p);
+ p = vmulq_f64 (ar3, p);
+ float64x2_t lo
+ = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p);
+ float64x2_t y = vaddq_f64 (hi, lo);
+ *tail = vaddq_f64 (vsubq_f64 (hi, y), lo);
+ return y;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+exp_special_case (float64x2_t x, float64x2_t xtail)
+{
+ return (float64x2_t){ exp_nosignbias (x[0], xtail[0]),
+ exp_nosignbias (x[1], xtail[1]) };
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
+static inline float64x2_t
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
+{
+ /* Fall back to scalar exp_inline for all lanes if any lane
+ contains a value x s.t. |x| < 2^-54 or |x| >= 512. */
+ uint64x2_t uoflowx = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+ d->thres_exp);
+ if (unlikely (v_any_u64 (uoflowx)))
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+ float64x2_t kd = vrndnq_f64 (z);
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = vsubq_f64 (r, neg_xtail);
+ /* 2^(k/N) ~= scale. */
+ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
+ uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64x2_t sbits = v_lookup_u64 (SBits, idx);
+ sbits = vaddq_u64 (sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
+ tmp = vfmaq_f64 (r, r2, tmp);
+ float64x2_t scale = vreinterpretq_f64_u64 (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return vfmaq_f64 (scale, scale, tmp);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+scalar_fallback (float64x2_t x, float64x2_t y)
+{
+ return (float64x2_t){ pow_scalar_special_case (x[0], y[0]),
+ pow_scalar_special_case (x[1], y[1]) };
+}
+
+float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Case of x <= 0 is too complicated to be vectorised efficiently here;
+ fall back to scalar pow for all lanes if any x < 0 is detected. */
+ if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
+ return scalar_fallback (x, y);
+
+ uint64x2_t vix = vreinterpretq_u64_f64 (x);
+ uint64x2_t viy = vreinterpretq_u64_f64 (y);
+ uint64x2_t iay = vandq_u64 (viy, d->inf);
+
+ /* Special cases of x or y. */
+#if WANT_SIMD_EXCEPT
+ /* Small or large. */
+ uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
+ uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
+ uint64x2_t specialx
+ = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
+ uint64x2_t specialy
+ = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
+#else
+ /* The case y==0 does not trigger a special case, since in this case it is
+ necessary to fix the result only if x is a signalling nan, which already
+ triggers a special case. We test y==0 directly in the scalar fallback. */
+ uint64x2_t iax = vandq_u64 (vix, d->inf);
+ uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
+ uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
+#endif
+ uint64x2_t special = vorrq_u64 (specialx, specialy);
+ /* Fallback to scalar on all lanes if any lane is inf or nan. */
+ if (unlikely (v_any_u64 (special)))
+ return scalar_fallback (x, y);
+
+ /* Small cases of x: |x| < 0x1p-126. */
+ uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
+ if (unlikely (v_any_u64 (smallx)))
+ {
+ /* Update ix if top 12 bits of x are 0. */
+ uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
+ if (unlikely (v_any_u64 (sub_x)))
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ uint64x2_t vix_norm = vreinterpretq_u64_f64 (
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
+ vix = vbslq_u64 (sub_x, vix_norm, vix);
+ }
+ }
+
+ /* Vector Log(ix, &lo). */
+ float64x2_t vlo;
+ float64x2_t vhi = v_log_inline (vix, &vlo, d);
+
+ /* Vector Exp(y_loghi, y_loglo). */
+ float64x2_t vehi = vmulq_f64 (y, vhi);
+ float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+ return v_exp_inline (vehi, neg_velo, d);
+}
+
+TEST_SIG (V, D, 2, pow)
+TEST_ULP (V_NAME_D2 (pow), 0.55)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
+/* Wide intervals spanning the whole domain but shared between x and y. */
+#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define EXPAND(str) str##000000000
+#define SHL52(str) EXPAND (str)
+V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, inf, 1000)
+/* x~1 or y~1. */
+V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+/* around argmaxs of ULP error. */
+V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer. */
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* 1.0^y. */
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
new file mode 100644
index 000000000000..47f74cf38ab0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
@@ -0,0 +1,209 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min. */
+#define MantissaMask v_u32 (0x007fffff)
+
+#define A d->log2_poly
+#define C d->exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+static const struct data
+{
+ struct
+ {
+ double invc, logc;
+ } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+ float64x2_t log2_poly[4];
+ uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
+ float64x2_t exp2f_poly[3];
+} data = {
+ .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+ {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+ {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+ {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+ {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+ {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+ {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+ {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+ {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+ {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+ {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+ {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+ {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+ {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+ {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+ {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+ {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+ {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+ {0x1p+0, 0x0p+0 * Scale},
+ {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+ {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+ {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+ {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+ {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+ {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+ {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+ {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+ {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+ {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+ {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+ {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+ {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+ .log2_poly = { /* rel err: 1.5 * 2^-30. */
+ V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
+ V2 (0x1.ec81d03c01aebp-2 * Scale),
+ V2 (-0x1.71547bb43f101p-1 * Scale),
+ V2 (0x1.7154764a815cbp0 * Scale)},
+ .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+ .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
+ V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
+ V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
+ V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+ return v_call2_f32 (powf, x, y, ret, cmp);
+}
+
+static inline float64x2_t
+ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
+ float64x2_t invc, float64x2_t logc, float64x2_t y)
+{
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc);
+ float64x2_t y0 = vaddq_f64 (logc, k);
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+ float64x2_t logx = vfmaq_f64 (A[1], r, A[0]);
+ logx = vfmaq_f64 (A[2], logx, r);
+ logx = vfmaq_f64 (A[3], logx, r);
+ logx = vfmaq_f64 (y0, logx, r);
+
+ return vmulq_f64 (logx, y);
+}
+
+static inline float64x2_t
+log2_lookup (const struct data *d, uint32_t i)
+{
+ return vld1q_f64 (
+ &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc);
+}
+
+static inline uint64x1_t
+exp2f_lookup (const struct data *d, uint64_t i)
+{
+ return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]);
+}
+
+static inline float32x2_t
+powf_core (const struct data *d, float64x2_t ylogx)
+{
+ /* N*x = k + r with r in [-1/2, 1/2]. */
+ float64x2_t kd = vrndnq_f64 (ylogx);
+ int64x2_t ki = vcvtaq_s64_f64 (ylogx);
+ float64x2_t r = vsubq_f64 (ylogx, kd);
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+ uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
+ exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
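+ /* exp2f_tab[i] stores asuint64 (2^(i/N)) with i << (52 - V_EXP2F_TABLE_BITS)
+ pre-subtracted; adding ki << (52 - V_EXP2F_TABLE_BITS) back below restores
+ the fractional part and folds floor(ki/N) into the exponent, so that
+ s ~= 2^(ki/N). */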
+ t = vaddq_u64 (
+ t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
+ float64x2_t s = vreinterpretq_f64_u64 (t);
+ float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
+ p = vfmaq_f64 (C[2], r, p);
+ p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
+ return vcvt_f32_f64 (p);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
+ uint32x4_t tmp = vsubq_u32 (u, Off);
+ uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
+ float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top));
+ int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
+ 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
+
+ /* Use double precision for each lane: split input vectors into lo and hi
+ halves and promote. */
+ float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)),
+ tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)),
+ tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)),
+ tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3));
+
+ float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)),
+ iz_hi = vcvt_high_f64_f32 (iz);
+
+ float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))),
+ k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
+
+ float64x2_t invc_lo = vzip1q_f64 (tab0, tab1),
+ invc_hi = vzip1q_f64 (tab2, tab3),
+ logc_lo = vzip2q_f64 (tab0, tab1),
+ logc_hi = vzip2q_f64 (tab2, tab3);
+
+ float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)),
+ y_hi = vcvt_high_f64_f32 (y);
+
+ float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo);
+ float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi);
+
+ uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
+ vreinterpretq_u32_f64 (ylogx_hi));
+
+ cmp = vorrq_u32 (
+ cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)),
+ vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS))
+ >> 47)));
+
+ float32x2_t p_lo = powf_core (d, ylogx_lo);
+ float32x2_t p_hi = powf_core (d, ylogx_hi);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp);
+ return vcombine_f32 (p_lo, p_hi);
+}
+
+HALF_WIDTH_ALIAS_F2 (pow)
+
+TEST_SIG (V, F, 2, pow)
+TEST_ULP (V_NAME_F2 (pow), 2.1)
+TEST_DISABLE_FENV (V_NAME_F2 (pow))
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
new file mode 100644
index 000000000000..0461bbb99405
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
@@ -0,0 +1,105 @@
+/*
+ * Double-precision vector sin function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+
+ .range_val = V2 (0x1p23),
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+};
+
+#if WANT_SIMD_EXCEPT
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u64 (0x1160000000000000)
+#endif
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (sin, x, y, cmp);
+}
+
+/* Vector (AdvSIMD) sin approximation.
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+ is 2.87 ULP:
+ _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+ want 0x1.fffffffa7dc05p-1
+ Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+ is 3.22 ULP:
+ _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+ want 0x1.ffdcd125c84f8p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
+ triggered correctly, set any special lanes to 0 (which is neutral w.r.t.
+ fenv). These lanes will be fixed by the special-case handler later. */
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
+#else
+ r = x;
+ cmp = vcageq_f64 (x, d->range_val);
+#endif
+
+ /* n = rint(x/pi). */
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+TEST_SIG (V, D, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (sin), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
new file mode 100644
index 000000000000..83bfa45efa98
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
@@ -0,0 +1,67 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincos declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincos from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "v_sincos_common.h"
+
+/* sincos not available for all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincos (double x, double *out_sin, double *out_cos)
+{
+ *out_sin = sin (x);
+ *out_cos = cos (x);
+}
+#endif
+
+static void VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, double *out_sin,
+ double *out_cos)
+{
+ if (special[0])
+ sincos (x[0], out_sin, out_cos);
+ if (special[1])
+ sincos (x[1], out_sin + 1, out_cos + 1);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u64 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
+TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
+TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
+#define V_SINCOS_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
+V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
+V_SINCOS_INTERVAL (0x1p23, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
new file mode 100644
index 000000000000..cd482f38d5f6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincosf declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincosf from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* sincos not available for all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincosf (float x, float *out_sin, float *out_cos)
+{
+ *out_sin = sinf (x);
+ *out_cos = cosf (x);
+}
+#endif
+
+static void VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float *out_sin,
+ float *out_cos)
+{
+ for (int i = 0; i < 4; i++)
+ if (special[i])
+ sincosf (x[i], out_sin + i, out_cos + i);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u32 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
+TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+#define V_SINCOSF_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
+V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
+V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
new file mode 100644
index 000000000000..fd425202ce67
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
@@ -0,0 +1,44 @@
+/*
+ * Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_sincospi_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using separate argument reduction and shared low-order
+ polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
+
+ float64x2x2_t sc = v_sincospi_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
+TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
+TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
+# define V_SINCOSPI_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
+V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
+V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
new file mode 100644
index 000000000000..760ea3d4f5e1
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
@@ -0,0 +1,43 @@
+/*
+ * Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincospif_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
+
+ float32x4x2_t sc = v_sincospif_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
+TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
+TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
+# define V_SINCOSPIF_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
+V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
+V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
new file mode 100644
index 000000000000..0764434039a0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .range_val = V4 (0x1p20f)
+};
+
+#if WANT_SIMD_EXCEPT
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u32 (0x27800000)
+#endif
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (sinf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 0 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+ special-case handler later. */
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
+#else
+ r = x;
+ cmp = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(x/pi). */
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
+
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (sin)
+
+TEST_SIG (V, F, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (sin), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
new file mode 100644
index 000000000000..f65ccd0c6270
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
@@ -0,0 +1,80 @@
+/*
+ * Double-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t halff;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ float64x2_t large_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+ .halff = V2 (0x3fe0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
+#endif
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
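+ /* With t = expm1(|x|): exp(|x|) = 1 + t and exp(-|x|) = 1/(1 + t), so
+ sinh(|x|) = (exp(|x|) - exp(-|x|)) / 2 = (t + t/(1 + t)) / 2; the final
+ multiply by halfsign (+/-0.5) restores both the 1/2 and the sign of x. */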
+ float64x2_t t = expm1_inline (ax, &d->d);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
+
+TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (sinh), 2.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
new file mode 100644
index 000000000000..12dbe26b425b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x1.61814ep+6),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
+{
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
+#endif
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, t, halfsign, special);
+
+ return vmulq_f32 (t, halfsign);
+}
+
+HALF_WIDTH_ALIAS_F1 (sinh)
+
+TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
new file mode 100644
index 000000000000..f86d167a2ac3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */
+/* asuint64(0x1p64) - TinyBound. */
+# define Thresh v_u64 (0x07f0000000000000)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (arm_math_sinpi, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector double-precision sinpi(x).
+ Maximum Error 3.05 ULP:
+ _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
+ want 0x1.fb295878301cap-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float64x2_t r = v_zerofy_f64 (x, cmp);
+#else
+ float64x2_t r = x;
+#endif
+
+ /* If rint(r) is odd, the sign of the result should be inverted. */
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+ /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f64 (r, vrndaq_f64 (r));
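+ /* sinpi(x) = sin(pi * (n + r)) = (-1)^n * sin(pi * r), with n = rint(x) and
+ the reduced r in [-1/2, 1/2], so the polynomial below approximates
+ sin(pi * r) and the parity of n is applied afterwards via 'odd'. */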
+
+ /* y = sin(pi * r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (sinpi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
new file mode 100644
index 000000000000..98ba9d84d2fb
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */
+# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (arm_math_sinpif, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector single-precision sinpi(x)
+ Maximum Error 3.03 ULP:
+ _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float32x4_t r = v_zerofy_f32 (x, cmp);
+#else
+ float32x4_t r = x;
+#endif
+
+ /* If rint(r) is odd, the sign of the result should be inverted. */
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+ /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (sinpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (sinpi), 2.54)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
new file mode 100644
index 000000000000..957f9aba3a1e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision vector tan(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[9];
+ double half_pi[2];
+ float64x2_t two_over_pi, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
+ V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6),
+ V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
+ V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
+ V2 (0x1.4e4fd14147622p-12) },
+ .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
+ .two_over_pi = V2 (0x1.45f306dc9c883p-1),
+ .shift = V2 (0x1.8p52),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V2 (0x1p23),
+#endif
+};
+
+#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
+#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
+#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
+
+/* Special cases (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x)
+{
+ return v_call_f64 (tan, x, x, v_u64 (-1));
+}
+
+/* Vector approximation for double-precision tan.
+ Maximum measured error is 3.48 ULP:
+ _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ /* Our argument reduction cannot calculate q with sufficient accuracy for
+ very large inputs. Fall back to scalar routine for all lanes if any are
+ too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+ tiny input to avoid underflow. */
+#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ /* iax - tiny_bound > range_val - tiny_bound. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+#endif
+
+ /* q = nearest integer to 2 * x / pi. */
+ float64x2_t q
+ = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift);
+ int64x2_t qi = vcvtq_s64_f64 (q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ float64x2_t r = x;
+ float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 1);
+ /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+ formula. */
+ r = vmulq_n_f64 (r, 0.5);
+
+ /* Approximate tan(r) using order 8 polynomial.
+ tan(x) is odd, so polynomial has the form:
+ tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+ Then compute the approximation by:
+ tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
+ float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2),
+ r8 = vmulq_f64 (r4, r4);
+ /* Offset coefficients to evaluate from C1 onwards. */
+ float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1);
+ p = vfmaq_f64 (dat->poly[0], p, r2);
+ p = vfmaq_f64 (r, r2, vmulq_f64 (p, r));
+
+ /* Recombination uses double-angle formula:
+ tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+ and reciprocity around pi/2:
+ tan(x) = 1 / (tan(pi/2 - x))
+ to assemble result using change-of-sign and conditional selection of
+ numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+ */
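+ /* With p ~= tan(r) and 2r = x - q * pi/2, let n = p^2 - 1 and d = 2p:
+ q even: tan(x) = tan(2r) = 2p / (1 - p^2) = -d / n
+ q odd: tan(x) = tan(2r + pi/2) = -cotan(2r) = (p^2 - 1) / (2p) = n / d. */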
+ float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
+ float64x2_t d = vaddq_f64 (p, p);
+
+ uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
+
+#if !WANT_SIMD_EXCEPT
+ uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+#endif
+
+ return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
+ vbslq_f64 (no_recip, d, n));
+}
+
+TEST_SIG (V, D, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
new file mode 100644
index 000000000000..ed5448649f6c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
@@ -0,0 +1,130 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+ float pi_consts[4];
+ float32x4_t shift;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
+ V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
+ /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
+ .pi_consts
+ = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
+ .shift = V4 (0x1.8p+23f),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V4 (0x1p15f),
+#endif
+};
+
+#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */
+#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */
+#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */
+
+/* Special cases (fall back to scalar calls). */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ return v_call_f32 (tanf, x, y, cmp);
+}
+
+/* Use a full Estrin scheme to evaluate polynomial. */
+static inline float32x4_t
+eval_poly (float32x4_t z, const struct data *d)
+{
+ float32x4_t z2 = vmulq_f32 (z, z);
+#if WANT_SIMD_EXCEPT
+ /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+ If fp exceptions are to be triggered correctly,
+ sidestep this by fixing such lanes to 0. */
+ uint32x4_t will_uflow
+ = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
+ if (unlikely (v_any_u32 (will_uflow)))
+ z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
+#endif
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+ return v_estrin_5_f32 (z, z2, z4, d->poly);
+}
+
+/* Fast implementation of AdvSIMD tanf.
+ Maximum error is 3.45 ULP:
+ __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+ want 0x1.ff9850p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t special_arg = x;
+
+ /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
+ argument reduction. */
+#if WANT_SIMD_EXCEPT
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ /* If fp exceptions are to be triggered correctly, also special-case tiny
+ input, as this will lead to overflow later. Fix any special lanes to 1 to
+ prevent any exceptions being triggered. */
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
+ if (unlikely (v_any_u32 (special)))
+ x = vbslq_f32 (special, v_f32 (1.0f), x);
+#else
+ /* Otherwise, special-case large and special values. */
+ uint32x4_t special = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(x/(pi/2)). */
+ float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
+ float32x4_t n = vsubq_f32 (q, d->shift);
+ /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
+ uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
+
+ /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
+ float32x4_t r;
+ r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
+
+ /* If x lives in an interval where |tan(x)|
+ - is finite, then use a polynomial approximation of the form
+ tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+ - grows to infinity then use symmetries of tangent and the identity
+ tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+ the same polynomial approximation of tan as above. */
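+ /* Concretely, for odd n: x = n * pi/2 + r, so tan(x) = -cotan(r)
+ = -1/tan(r) = 1/tan(-r); hence negate r and select the reciprocal below. */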
+
+ /* Invert sign of r if odd quadrant. */
+ float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1)));
+
+ /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */
+ float32x4_t z2 = vmulq_f32 (r, r);
+ float32x4_t p = eval_poly (z2, d);
+ float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* Compute reciprocal and apply if required. */
+ float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
+ return vbslq_f32 (pred_alt, inv_y, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (tan)
+
+TEST_SIG (V, F, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
new file mode 100644
index 000000000000..3dc6e5527ffc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
@@ -0,0 +1,67 @@
+/*
+ * Double-precision vector tanh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ .d = V_EXPM1_DATA,
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
+{
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+ /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+ They will be fixed up later by the special-case handler. */
+ if (unlikely (v_any_u64 (special)))
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
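+ /* Equivalently, with q = expm1(2x): tanh(x) = q / (q + 2). */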
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, q, qp2, special);
+ return vdivq_f64 (q, qp2);
+}
+
+TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (tanh), 2.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
new file mode 100644
index 000000000000..18fe93c7e7ba
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
@@ -0,0 +1,81 @@
+/*
+ * Single-precision vector tanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t boring_bound, large_bound, onef;
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = V4 (0x41102cb3),
+ .large_bound = V4 (0x7f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
+{
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.58 ULP:
+ _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+ /* exponent_bias in the expm1f data is asuint (1.0f), so boring is +/-1.0f. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered properly, set all special and boring
+ lanes to 0, which will trigger no exceptions, and fix them up later. */
+ uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+ vcltq_u32 (iax, v_u32 (0x34000000)));
+ x = v_zerofy_f32 (x, is_boring);
+ if (unlikely (v_any_u32 (special)))
+ x = v_zerofy_f32 (x, special);
+#else
+ uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+ return vbslq_f32 (is_boring, boring, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (tanh)
+
+TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (tanh), 2.09)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
new file mode 100644
index 000000000000..16de00ad5556
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
@@ -0,0 +1,88 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12;
+ double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+ /* Coefficients for tan(pi * x) computed with fpminimax
+ on [ 0x1p-1022 0x1p-2 ]
+ approx rel error: 0x1.7eap-55
+ approx abs error: 0x1.7eap-55. */
+ .c0 = V2 (0x1.921fb54442d18p1), /* pi. */
+ .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5),
+ .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9),
+ .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13),
+ .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17),
+ .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21),
+ .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
+ .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+ The maximum error is 3.06 ULP:
+ _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+ want -0x1.fa30112702c95p+3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
+{
+ const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+ float64x2_t n = vrndnq_f64 (x);
+
+ /* inf produces nan that propagates. */
+ float64x2_t xr = vsubq_f64 (x, n);
+ float64x2_t ar = vabdq_f64 (x, n);
+ uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
+ float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
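+ /* tan(pi * x) has period 1, so tan(pi * x) = tan(pi * xr) with
+ xr = x - rint(x) in [-0.5, 0.5]. For |xr| > 0.25, use
+ tan(pi * |xr|) = 1 / tan(pi * (0.5 - |xr|)), selected via 'flip'; the sign
+ of xr is restored at the end. */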
+
+ /* Order-14 pairwise Horner. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+
+ float64x2_t c_1_3 = vld1q_f64 (&d->c1);
+ float64x2_t c_5_7 = vld1q_f64 (&d->c5);
+ float64x2_t c_9_11 = vld1q_f64 (&d->c9);
+ float64x2_t c_13_14 = vld1q_f64 (&d->c13);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
+
+ float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
+ p = vfmaq_f64 (p1011, r4, p);
+ p = vfmaq_f64 (p89, r4, p);
+ p = vfmaq_f64 (p67, r4, p);
+ p = vfmaq_f64 (p45, r4, p);
+ p = vfmaq_f64 (p23, r4, p);
+ p = vfmaq_f64 (p01, r4, p);
+ p = vmulq_f64 (r, p);
+
+ float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
+ float64x2_t y = vbslq_f64 (flip, p_recip, p);
+
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
+ return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
+TEST_ULP (V_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
new file mode 100644
index 000000000000..7bd6d206819f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
@@ -0,0 +1,70 @@
+/*
+ * Single-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpif_data
+{
+ float32x4_t c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+} tanpif_data = {
+ /* Coefficients for tan(pi * x). */
+ .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f),
+ .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
+ .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpi(x)
+ The maximum error is 3.34 ULP:
+ _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+ want 0x1.f70aa6p+2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
+{
+ const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+
+ float32x4_t n = vrndnq_f32 (x);
+
+ /* inf produces nan that propagates. */
+ float32x4_t xr = vsubq_f32 (x, n);
+ float32x4_t ar = vabdq_f32 (x, n);
+ uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
+ float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
+
+ /* Order-7 pairwise Horner polynomial evaluation scheme. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+
+ float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
+ float32x4_t p = vfmaq_f32 (p45, r4, p67);
+ p = vfmaq_f32 (p23, r4, p);
+ p = vfmaq_f32 (p01, r4, p);
+
+ p = vmulq_f32 (r, p);
+ float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
+ float32x4_t y = vbslq_f32 (flip, p_recip, p);
+
+ uint32x4_t sign
+ = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
+ return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
+}
+
+HALF_WIDTH_ALIAS_F1 (tanpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
+TEST_ULP (V_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
new file mode 100644
index 000000000000..797d217820c3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
@@ -0,0 +1,58 @@
+/*
+ * Helper for single-precision routines which calculate exp(ax) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPF_INLINE_H
+#define MATH_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
+ }
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(ax).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
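+ /* Adding n << 23 to asuint (1.0f) places n directly in the exponent field,
+ giving scale = 2^n; there is no overflow/underflow handling here - the
+ calling routine deals with special values. */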
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif // MATH_V_EXPF_INLINE_H
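As an editorial sketch (not part of the patch), this is roughly how a caller such as the coshf routine in this directory can embed the helper. The function and constant names below are hypothetical, and special-case handling (overflow, NaN) is assumed to happen in the caller.

#include "v_expf_inline.h"

static const struct v_expf_data expf_consts = V_EXPF_DATA;

static inline float32x4_t
v_cosh_core_example (float32x4_t x)
{
  /* The helper takes |x| internally, so t = exp(|x|).  */
  float32x4_t t = v_expf_inline (x, &expf_consts);
  /* cosh(x) = (exp(|x|) + 1/exp(|x|)) / 2, valid away from overflow.  */
  float32x4_t half_t = vmulq_f32 (t, v_f32 (0.5f));
  float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5f), t);
  return vaddq_f32 (half_t, half_over_t);
}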
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
new file mode 100644
index 000000000000..82d2e9415d93
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
@@ -0,0 +1,86 @@
+/*
+ * Helper for double-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1_INLINE_H
+#define MATH_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */

+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif // MATH_V_EXPM1_INLINE_H
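As an editorial sketch (not part of the patch), a sinh-style caller of the helper above. With t = expm1(|x|), exp(|x|) - exp(-|x|) = t + t / (t + 1), so sinh(x) = copysign (0.5 * (t + t / (t + 1)), x). Names are hypothetical and special cases (large |x|, NaN) are assumed to be filtered by the caller.

#include "v_expm1_inline.h"

static const struct v_expm1_data expm1_consts = V_EXPM1_DATA;

static inline float64x2_t
v_sinh_core_example (float64x2_t x)
{
  uint64x2_t sign
      = vandq_u64 (vreinterpretq_u64_f64 (x), v_u64 (0x8000000000000000));
  float64x2_t t = expm1_inline (vabsq_f64 (x), &expm1_consts);
  float64x2_t body
      = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
  /* Fold the factor 0.5 and the sign of x into one multiplicand.  */
  float64x2_t halfsign = vreinterpretq_f64_u64 (
      vorrq_u64 (vreinterpretq_u64_f64 (v_f64 (0.5)), sign));
  return vmulq_f64 (body, halfsign);
}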
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
new file mode 100644
index 000000000000..463b07aa7705
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
@@ -0,0 +1,62 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1F_INLINE_H
+#define MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1f_data
+{
+ float32x4_t c0, c2;
+ int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f). */
+#define V_EXPM1F_DATA \
+ { \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
+ p = vfmaq_f32 (f, f2, p);
+
+ /* t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif // MATH_V_EXPM1F_INLINE_H
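As an editorial sketch (not part of the patch), a tanhf-style caller of the helper above. With q = expm1(2x), tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2). Names are hypothetical; large |x| and NaN are assumed to be handled by the caller.

#include "v_expm1f_inline.h"

static const struct v_expm1f_data expm1f_consts = V_EXPM1F_DATA;

static inline float32x4_t
v_tanh_core_example (float32x4_t x)
{
  float32x4_t q = expm1f_inline (vmulq_f32 (x, v_f32 (2.0f)), &expm1f_consts);
  return vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0f)));
}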
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
new file mode 100644
index 000000000000..ef906ae4b603
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
@@ -0,0 +1,119 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and
+ * do not need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef MATH_V_LOG1P_INLINE_H
+#define MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+
+struct v_log1p_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2[2];
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
+ .c18 = -0x1.cfa7385bdb37ep-6, \
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+ p = vfmaq_f64 (p1415, m2, p);
+ p = vfmaq_f64 (p1213, m2, p);
+ p = vfmaq_f64 (p1011, m2, p);
+ p = vfmaq_f64 (p89, m2, p);
+ p = vfmaq_f64 (p67, m2, p);
+ p = vfmaq_f64 (p45, m2, p);
+ p = vfmaq_f64 (p23, m2, p);
+ return vfmaq_f64 (p01, m2, p);
+}
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1):
+ - No special-case handling - this should be dealt with by the caller.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+       using a vector select, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+# error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = eval_poly (f, f2, d);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif // MATH_V_LOG1P_INLINE_H
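The #error above means the k0-shortcut knob must be chosen before the header is included. A minimal, hypothetical caller (editorial sketch, not part of the patch) looks like this:

#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"

static const struct v_log1p_data log1p_consts = V_LOG1P_CONSTANTS_TABLE;

static inline float64x2_t
v_log1p_wrapper_example (float64x2_t x)
{
  /* No special-case handling here: the caller is expected to filter
     x <= -1, NaN and other problem inputs first.  */
  return log1p_inline (x, &log1p_consts);
}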
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
new file mode 100644
index 000000000000..e81fa24486ae
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
@@ -0,0 +1,94 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_LOG1PF_INLINE_H
+#define MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+struct v_log1pf_data
+{
+ uint32x4_t four;
+ int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
+ p = vmulq_f32 (m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
+{
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+   scale factor s = 4*2^(-k) to ensure the scale is representable
+ as a normalised fp32 number. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+  /* The scale factor to be applied back at the end - multiplying float(k)
+     by 2^-23 recovers k, the unbiased exponent, as a float. */
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
+}
+
+#endif // MATH_V_LOG1PF_INLINE_H
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
new file mode 100644
index 000000000000..770f9e81c195
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
@@ -0,0 +1,104 @@
+/*
+ * Double-precision vector log(x) function - inline version
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "math_config.h"
+
+#ifndef V_LOG_INLINE_POLY_ORDER
+# error Cannot use inline log helper without specifying poly order (options are 4 or 5)
+#endif
+
+#if V_LOG_INLINE_POLY_ORDER == 4
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \
+ V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \
+ }
+#elif V_LOG_INLINE_POLY_ORDER == 5
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \
+ V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \
+ V2 (-0x1.554e550bd501ep-3) \
+ }
+#else
+# error Can only choose order 4 or 5 for log poly
+#endif
+
+struct v_log_inline_data
+{
+ float64x2_t poly[V_LOG_INLINE_POLY_ORDER];
+ float64x2_t ln2;
+ uint64x2_t off, sign_exp_mask;
+};
+
+#define V_LOG_CONSTANTS \
+ { \
+ .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \
+ .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \
+ }
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+log_lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static inline float64x2_t
+v_log_inline (float64x2_t x, const struct v_log_inline_data *d)
+{
+ float64x2_t z, r, r2, p, y, kd, hi;
+ uint64x2_t ix, iz, tmp;
+ int64x2_t k;
+ struct entry e;
+
+ ix = vreinterpretq_u64_f64 (x);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = vsubq_u64 (ix, d->off);
+ k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ z = vreinterpretq_f64_u64 (iz);
+ e = log_lookup (tmp);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (A (2), A (3), r);
+ p = vfmaq_f64 (A (0), A (1), r);
+#if V_LOG_INLINE_POLY_ORDER == 5
+ y = vfmaq_f64 (y, A (4), r2);
+#endif
+ y = vfmaq_f64 (p, y, r2);
+
+ return vfmaq_f64 (hi, y, r2);
+}
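Likewise, v_log_inline.h refuses to compile unless the polynomial order is chosen before inclusion. A hypothetical caller (editorial sketch, not part of the patch):

#define V_LOG_INLINE_POLY_ORDER 5
#include "v_log_inline.h"

static const struct v_log_inline_data log_consts = V_LOG_CONSTANTS;

static inline float64x2_t
v_log_wrapper_example (float64x2_t x)
{
  /* Valid for positive, finite, normal x; zero, negative, subnormal and
     non-finite inputs are the caller's responsibility.  */
  return v_log_inline (x, &log_consts);
}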
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
new file mode 100644
index 000000000000..75cd71cc87a7
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
@@ -0,0 +1,202 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
+#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
+
+#if USE_GLIBC_ABI
+
+# define HALF_WIDTH_ALIAS_F1(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
+ { \
+ return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
+ }
+
+# define HALF_WIDTH_ALIAS_F2(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
+ { \
+ return vget_low_f32 ( \
+ _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
+ }
+
+#else
+# define HALF_WIDTH_ALIAS_F1(fun)
+# define HALF_WIDTH_ALIAS_F2(fun)
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+#define V2(X) \
+ { \
+ X, X \
+ }
+#define V4(X) \
+ { \
+ X, X, X, X \
+ }
+#define V8(X) \
+ { \
+ X, X, X, X, X, X, X, X \
+ }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+ return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+ return (uint32x4_t) V4 (x);
+}
+static inline int32x4_t
+v_s32 (int32_t x)
+{
+ return (int32x4_t) V4 (x);
+}
+
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+ return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+ return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+ float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2],
+ p[3] ? f (x1[3], x2[3]) : y[3] };
+}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
+static inline float64x2_t
+v_f64 (double x)
+{
+ return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+ return (uint64x2_t) V2 (x);
+}
+static inline int64x2_t
+v_s64 (int64_t x)
+{
+ return (int64x2_t) V2 (x);
+}
+
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+ return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+ return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+ double p1 = p[1];
+ double x1 = x[1];
+ if (likely (p[0]))
+ y[0] = f (x[0]);
+ if (likely (p1))
+ y[1] = f (x1);
+ return y;
+}
+
+static inline float64x2_t
+v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
+ float64x2_t y, uint64x2_t p)
+{
+ double p1 = p[1];
+ double x1h = x1[1];
+ double x2h = x2[1];
+ if (likely (p[0]))
+ y[0] = f (x1[0], x2[0]);
+ if (likely (p1))
+ y[1] = f (x1h, x2h);
+ return y;
+}
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
+{
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
+}
+
+#endif
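An editorial sketch (not part of the patch) of how the v_call_f32 fallback helper above is typically used: a routine computes its fast vector result, builds a lane mask of special inputs, and re-evaluates only the flagged lanes with a scalar function. The names below are hypothetical; expf is just a stand-in for whichever scalar routine matches the vector one.

#include <math.h>
#include "v_math.h"

static float32x4_t VPCS_ATTR
special_case_example (float32x4_t x, float32x4_t y, uint32x4_t special)
{
  /* Lanes flagged in `special` are recomputed with the scalar expf; all
     other lanes keep the fast vector result y.  */
  return v_call_f32 (expf, x, y, special);
}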
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
new file mode 100644
index 000000000000..9a9c5c1ac15b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on single-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_POLY_ADVSIMD_F32_H
+#define MATH_POLY_ADVSIMD_F32_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f32. */
+#define VTYPE float32x4_t
+#define FMA(x, y, z) vfmaq_f32 (z, x, y)
+#define VWRAP(f) v_##f##_f32
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
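An editorial sketch (not part of the patch) of how the generated wrappers are called, assuming the poly_generic helpers behave as in the sincospi callers later in this patch (v_pw_horner_5_f32 and friends). The coefficients here are arbitrary and purely illustrative.

#include "v_math.h"
#include "v_poly_f32.h"

/* Arbitrary degree-5 polynomial coefficients, for illustration only.  */
static const float32x4_t example_poly[6]
    = { V4 (1.0f),   V4 (0.5f),    V4 (0.25f),
	V4 (0.125f), V4 (0.0625f), V4 (0.03125f) };

static inline float32x4_t
poly_eval_example (float32x4_t x)
{
  /* v_pw_horner_5_f32 (x, x^2, c) evaluates c[0] + c[1] x + ... + c[5] x^5
     with the pairwise-Horner scheme.  */
  float32x4_t x2 = vmulq_f32 (x, x);
  return v_pw_horner_5_f32 (x, x2, example_poly);
}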
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
new file mode 100644
index 000000000000..4331bfbd03b0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on double-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_POLY_ADVSIMD_F64_H
+#define MATH_POLY_ADVSIMD_F64_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f64. */
+#define VTYPE float64x2_t
+#define FMA(x, y, z) vfmaq_f64 (z, x, y)
+#define VWRAP(f) v_##f##_f64
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
new file mode 100644
index 000000000000..14227d9339a8
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
@@ -0,0 +1,86 @@
+/*
+ * Core approximation for double-precision vector sincos
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincos_data
+{
+ float64x2_t sin_poly[7], cos_poly[6], pio2[3];
+ float64x2_t inv_pio2, shift, range_val;
+} v_sincos_data = {
+ .inv_pio2 = V2 (0x1.45f306dc9c882p-1),
+ .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26),
+ V2 (0x1.1a62633145c07p-54) },
+ .shift = V2 (0x1.8p52),
+ .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */
+ V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */
+ V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
+ V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
+ V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
+ .range_val = V2 (0x1p23), };
+
+static inline uint64x2_t
+check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
+{
+ return vcagtq_f64 (x, d->range_val);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+static inline float64x2x2_t
+v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
+{
+ /* q = nearest integer to 2 * x / pi. */
+ float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
+ int64x2_t n = vcvtq_s64_f64 (q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, q, d->pio2[0]);
+ r = vfmsq_f64 (r, q, d->pio2[1]);
+ r = vfmsq_f64 (r, q, d->pio2[2]);
+
+ float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
+ s = vfmaq_f64 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
+ c = vfmaq_f64 (v_f64 (-0.5), r2, c);
+ c = vfmaq_f64 (v_f64 (1), r2, c);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
+ float64x2_t ss = vbslq_f64 (swap, c, s);
+ float64x2_t cc = vbslq_f64 (swap, s, c);
+
+ /* Fix signs according to quadrant.
+ ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+ cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)). */
+ uint64x2_t sin_sign
+ = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
+ uint64x2_t cos_sign = vshlq_n_u64 (
+ vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
+ 62);
+ ss = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
+ cc = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
+
+ return (float64x2x2_t){ ss, cc };
+}
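An editorial sketch (not part of the patch) of a front-end around the helper above, in the spirit of the sincos/cexpi callers. The real routines dispatch flagged lanes to a scalar fallback and fetch the data pointer through ptr_barrier; both are simplified away here, and the name is hypothetical.

#include "v_sincos_common.h"

static inline void
sincos_example (float64x2_t x, float64x2_t *out_sin, float64x2_t *out_cos)
{
  const struct v_sincos_data *d = &v_sincos_data;
  /* Lanes with |x| above range_val would need a fallback path, which is
     omitted from this sketch.  */
  uint64x2_t special = check_ge_rangeval (x, d);
  (void) special;
  float64x2x2_t sc = v_sincos_inline (x, d);
  *out_sin = sc.val[0];
  *out_cos = sc.val[1];
}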
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
new file mode 100644
index 000000000000..7c29eded14d6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
@@ -0,0 +1,84 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+static const struct v_sincosf_data
+{
+ float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} v_sincosf_data = {
+ .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */
+ V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
+ .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */
+ V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
+ .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
+ .inv_pio2 = V4 (0x1.45f306p-1f),
+ .shift = V4 (0x1.8p23),
+ .range_val = V4 (0x1p20),
+};
+
+static inline uint32x4_t
+check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
+{
+ return vcagtq_f32 (x, d->range_val);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+static inline float32x4x2_t
+v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
+{
+ /* n = rint ( x / (pi/2) ). */
+ float32x4_t shift = d->shift;
+ float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
+ q = vsubq_f32 (q, shift);
+ int32x4_t n = vcvtq_s32_f32 (q);
+
+ /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
+ float32x4_t r = x;
+ r = vfmsq_f32 (r, q, d->pio2[0]);
+ r = vfmsq_f32 (r, q, d->pio2[1]);
+ r = vfmsq_f32 (r, q, d->pio2[2]);
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
+ float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
+ s = vfmaq_f32 (d->poly_sin[0], r2, s);
+ s = vfmaq_f32 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
+ float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
+ c = vfmaq_f32 (c, r4, p);
+ c = vfmaq_f32 (v_f32 (1), c, r2);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
+ float32x4_t ss = vbslq_f32 (swap, c, s);
+ float32x4_t cc = vbslq_f32 (swap, s, c);
+
+ /* Fix signs according to quadrant.
+ ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+ cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */
+ uint32x4_t sin_sign
+ = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
+ uint32x4_t cos_sign = vshlq_n_u32 (
+ vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
+ 30);
+ ss = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
+ cc = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
+
+ return (float32x4x2_t){ ss, cc };
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
new file mode 100644
index 000000000000..438b141b9174
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
@@ -0,0 +1,64 @@
+/*
+ * Helper for Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincospi_data
+{
+ float64x2_t poly[10], range_val;
+} v_sincospi_data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using separate argument reduction and shared low-order
+ polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+static inline float64x2x2_t
+v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
+{
+  /* If x rounds to an odd integer, the sign of the result should be
+     inverted for sinpi and reintroduced for cospi. */
+ uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
+ uint64x2_t odd = vshlq_n_u64 (
+ vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
+
+ /* r = x - rint(x). */
+ float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float64x2_t sr2 = vmulq_f64 (sr, sr);
+ float64x2_t sr4 = vmulq_f64 (sr2, sr2);
+ float64x2_t cr2 = vmulq_f64 (cr, cr);
+ float64x2_t cr4 = vmulq_f64 (cr2, cr2);
+
+ float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
+ float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
+
+ float64x2_t sinpix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
+
+ float64x2_t cospix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
+
+ return (float64x2x2_t){ sinpix, cospix };
+}
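An editorial scalar model (not part of the patch) of the reduction used above: with n = round(x) and r = x - n, sinpi(x) = (-1)^n * sinpi(r) and cospi(x) = (-1)^n * sinpi(0.5 - |r|), so a single polynomial for sinpi on [-0.5, 0.5] serves both outputs. The huge-argument masking (range_val) is omitted, so the cast below assumes |x| < 2^63; libm sin stands in for the polynomial.

#include <math.h>

static void
sincospi_scalar_model (double x, double *sp, double *cp)
{
  const double pi = 0x1.921fb54442d18p+1;
  double n = nearbyint (x);	    /* The vector code rounds ties away.  */
  double r = x - n;		    /* r in [-0.5, 0.5].  */
  int negate = (long long) n & 1;   /* Odd n flips both signs.  */
  double s = sin (pi * r);	    /* Stands in for the shared polynomial.  */
  double c = sin (pi * (0.5 - fabs (r)));
  *sp = negate ? -s : s;
  *cp = negate ? -c : c;
}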
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
new file mode 100644
index 000000000000..8d4177dd871e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
@@ -0,0 +1,57 @@
+/*
+ * Helper for Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+static const struct v_sincospif_data
+{
+ float32x4_t poly[6], range_val;
+} v_sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+static inline float32x4x2_t
+v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
+{
+  /* If x rounds to an odd integer, the sign of the result should be
+     inverted for sinpi and reintroduced for cospi. */
+ uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
+ uint32x4_t odd = vshlq_n_u32 (
+ vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
+
+ /* r = x - rint(x). */
+ float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t sr2 = vmulq_f32 (sr, sr);
+ float32x4_t sr4 = vmulq_f32 (sr2, sr2);
+ float32x4_t cr2 = vmulq_f32 (cr, cr);
+ float32x4_t cr4 = vmulq_f32 (cr2, cr2);
+
+ float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
+ float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
+
+ float32x4_t sinpix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
+ float32x4_t cospix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
+
+ return (float32x4x2_t){ sinpix, cospix };
+}