Diffstat (limited to 'contrib/arm-optimized-routines/math/aarch64/advsimd')
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c  115
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c  65
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c  78
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c  106
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c  242
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c  89
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c  135
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c  171
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c  127
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c  109
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c  75
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c  90
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c  127
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c  117
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c  47
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c  49
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c  89
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c  107
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c  87
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c  166
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c  205
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c  174
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c  120
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c  134
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c  147
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c  147
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c  128
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c  73
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c  79
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c  77
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c  82
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h  361
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c  95
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c  96
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log.c  118
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c  132
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c  106
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c  61
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c  123
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c  102
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c  88
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c  33
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c  34
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c  284
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c  209
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c  105
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c  67
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c  68
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c  44
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c  43
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c  92
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c  80
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c  87
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c  122
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c  130
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c  67
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c  81
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c  88
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c  70
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h  58
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h  62
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h  119
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h  94
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h  104
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h  202
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h  24
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h  24
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h  86
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h  84
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h  64
-rw-r--r--  contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h  57
82 files changed, 8641 insertions, 0 deletions
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
new file mode 100644
index 000000000000..7873a07e6f56
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi, pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi = V2 (0x1.921fb54442d18p+1),
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (acos, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+ want 0x1.0d54d1985c069p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+ want 0x1.edbbedf8a7d6cp-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (Oneu - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x),
+ vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax));
+ float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
+
+ uint64x2_t is_neg = vcltzq_f64 (x);
+ float64x2_t off = vreinterpretq_f64_u64 (
+ vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
+ float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0));
+ float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off);
+
+ return vfmaq_f64 (add, mul, y);
+}
+
+TEST_SIG (V, D, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (acos), 1.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
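The change of variable documented in acos.c above can be sanity-checked against scalar libm: acos(x) = pi/2 - asin(x) for |x| < 0.5, 2*asin(sqrt((1-x)/2)) for x in [0.5, 1], and pi minus that quantity for x in [-1, -0.5]. A minimal standalone sketch (illustrative only; acos_ref is a name invented here, and libm's asin stands in for the polynomial P):

#include <math.h>
#include <stdio.h>

/* Scalar reconstruction of acos from asin with the same interval split as
   the vector routine: direct identity below 0.5, change of variable
   z = (1 - |x|)/2 at and above it.  */
static double
acos_ref (double x)
{
  const double pi = 0x1.921fb54442d18p+1;
  double ax = fabs (x);
  if (ax < 0.5)
    return 0.5 * pi - asin (x);
  double q = 2.0 * asin (sqrt ((1.0 - ax) * 0.5));
  return x > 0 ? q : pi - q;
}

int
main (void)
{
  double xs[] = { -0.99, -0.5, -0.25, 0.0, 0.25, 0.5, 0.99 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=% .2f acos=% .17g ref=% .17g\n", xs[i], acos (xs[i]),
            acos_ref (xs[i]));
  return 0;
}

Built with a C99 compiler and -lm, the two columns should agree to within a few ulps.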
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
new file mode 100644
index 000000000000..e200f792c764
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
@@ -0,0 +1,115 @@
+/*
+ * Single-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f, pif;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+ .pif = V4 (0x1.921fb6p+1f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x32800000 /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (acosf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.26 ulps,
+ _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
+
+ uint32x4_t is_neg = vcltzq_f32 (x);
+ float32x4_t off = vreinterpretq_f32_u32 (
+ vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
+ float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0));
+ float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
+
+ return vfmaq_f32 (add, mul, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (acos)
+
+TEST_SIG (V, F, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (acos), 0.82)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
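The WANT_SIMD_EXCEPT path in acosf.c folds the |x| > 1, |x| < Small and NaN checks into a single unsigned compare: subtracting Small makes tiny inputs wrap around to huge values, and anything whose absolute bits exceed One (including inf and NaN encodings) lands beyond One - Small. A scalar illustration of the same bit trick (asuint and is_special are names invented here; the constants are the ones defined in the file above):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AbsMask 0x7fffffff
#define One 0x3f800000
#define Small 0x32800000 /* asuint (0x1p-26).  */

static uint32_t
asuint (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

/* One unsigned compare flags |x| > 1, |x| < 2^-26 and NaN: subtracting
   Small wraps tiny inputs around, and anything above One (including inf
   and NaN) lands past One - Small.  */
static int
is_special (float x)
{
  uint32_t ia = asuint (x) & AbsMask;
  return (uint32_t) (ia - Small) > (uint32_t) (One - Small);
}

int
main (void)
{
  float xs[] = { 0.25f, 1.0f, 1.5f, 0x1p-30f, INFINITY, NAN };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("%-12g special=%d\n", (double) xs[i], is_special (xs[i]));
  return 0;
}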
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
new file mode 100644
index 000000000000..55d8ed5a421e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
@@ -0,0 +1,65 @@
+/*
+ * Double-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
+{
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (special)))
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
+
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
+}
+
+TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (V_NAME_D1 (acosh), 2.53)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
+TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
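The xm1/y sequence in acosh.c implements acosh(x) = log(x + sqrt(x^2 - 1)) rewritten as log1p((x - 1) + sqrt((x - 1)(x + 1))), which stays accurate as x approaches 1. A scalar sketch of the same rearrangement, checked against libm acosh (illustrative; acosh_ref is not part of the commit):

#include <math.h>
#include <stdio.h>

/* acosh(x) = log(x + sqrt(x^2 - 1))
            = log1p((x - 1) + sqrt((x - 1) * (x + 1))),
   since x + sqrt(x^2 - 1) = 1 + (x - 1) + sqrt((x - 1)(x + 1)).  */
static double
acosh_ref (double x)
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}

int
main (void)
{
  double xs[] = { 1.0, 1.0 + 0x1p-20, 2.0, 10.0, 0x1p100 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-10g acosh=%.17g ref=%.17g\n", xs[i], acosh (xs[i]),
            acosh_ref (xs[i]));
  return 0;
}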
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
new file mode 100644
index 000000000000..029d457cfa8a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#define SquareLim 0x1p64
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data *d)
+{
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
+ With exceptions disabled, we can compute u with a shorter dependency chain,
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
+
+#if WANT_SIMD_EXCEPT
+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
+#else
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
+#endif
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
+}
+
+HALF_WIDTH_ALIAS_F1 (acosh)
+
+TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
+#if WANT_SIMD_EXCEPT
+TEST_ULP (V_NAME_F1 (acosh), 2.50)
+#else
+TEST_ULP (V_NAME_F1 (acosh), 2.78)
+#endif
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
+TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
+TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
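The two u computations in acoshf.c are algebraically the same quantity x^2 - 1: the exception-safe branch uses only xm1 via 2*m + m^2 with m = x - 1, while the default branch uses the shorter (x - 1)(x + 1) dependency chain. A small check of that equivalence (illustrative only; variable names are invented here):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* u = 2m + m^2 (xm1-only, used with SIMD exceptions) and
     u = (x - 1)(x + 1) (shorter chain) are both x^2 - 1.  */
  for (float x = 1.0f; x < 16.0f; x += 1.25f)
    {
      float m = x - 1.0f;
      float u_xm1_only = fmaf (m, m, 2.0f * m);
      float u_short = m * (x + 1.0f);
      printf ("x=%-6g u_xm1_only=%.9g u_short=%.9g\n", (double) x,
              (double) u_xm1_only, (double) u_short);
    }
  return 0;
}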
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
new file mode 100644
index 000000000000..c751d9264a12
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
@@ -0,0 +1,130 @@
+/*
+ * Double-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10;
+ float64x2_t pi_over_2;
+ uint64x2_t abs_mask;
+ double c1, c3, c5, c7, c9, c11;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4,
+ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6,
+ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6,
+ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7,
+ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6,
+ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asin, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 1.01 ulps,
+ _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+ want 0x1.ed78525a927eep-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+ want 0x1.1111dd54ddf99p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate exceptions are raised. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (One - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x),
+ vfmsq_n_f64 (v_f64 (0.5), ax, 0.5));
+ float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+
+ /* order-11 estrin. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+ float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
+ float64x2_t p = vfmaq_f64 (p07, z16, p811);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (asin), 2.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
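The block of vfmaq_laneq/vfmaq calls in asin.c is an order-11 Estrin evaluation: coefficient pairs are combined with z2, then folded with z4, z8 and z16, keeping the FMA dependency chain roughly four deep instead of eleven for Horner. A generic scalar version of the same regrouping, compared against Horner (illustrative; the coefficients below are arbitrary test values, not the asin coefficients):

#include <stdio.h>

/* Horner: c0 + x*(c1 + x*(... + x*c11)).  */
static double
horner_11 (double x, const double c[12])
{
  double p = c[11];
  for (int i = 10; i >= 0; i--)
    p = c[i] + x * p;
  return p;
}

/* Estrin: pair coefficients with x, then fold with x^2, x^4 and x^8 - the
   same regrouping the vector asin uses with z2/z4/z8/z16.  */
static double
estrin_11 (double x, const double c[12])
{
  double x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
  double p01 = c[0] + x * c[1], p23 = c[2] + x * c[3];
  double p45 = c[4] + x * c[5], p67 = c[6] + x * c[7];
  double p89 = c[8] + x * c[9], p1011 = c[10] + x * c[11];
  double p03 = p01 + x2 * p23;
  double p47 = p45 + x2 * p67;
  double p811 = p89 + x2 * p1011;
  double p07 = p03 + x4 * p47;
  return p07 + x8 * p811;
}

int
main (void)
{
  /* Arbitrary test coefficients.  */
  double c[12] = { 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12 };
  for (double x = -1.0; x <= 1.0; x += 0.25)
    printf ("x=% .2f horner=% .17g estrin=% .17g\n", x, horner_11 (x, c),
            estrin_11 (x, c));
  return 0;
}

The two schemes evaluate the same polynomial, so the printed values should match up to rounding differences.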
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
new file mode 100644
index 000000000000..970feb37e1d5
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x39800000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate fp exceptions are raised. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float32x4_t y
+ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
+}
+
+HALF_WIDTH_ALIAS_F1 (asin)
+
+TEST_SIG (V, F, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (asin), 1.91)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
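For |x| >= 0.5, asinf.c relies on asin(x) = pi/2 - 2*asin(sqrt((1 - x)/2)), the same change of variable as the double-precision routine. A quick scalar check of that identity against libm (illustrative only):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  /* asin(x) = pi/2 - 2*asin(sqrt((1 - x)/2)) for x in [0.5, 1]: the change
     of variable that lets one odd polynomial cover both intervals.  */
  for (double x = 0.5; x <= 1.0; x += 0.0625)
    {
      double ref = pi_over_2 - 2.0 * asin (sqrt ((1.0 - x) * 0.5));
      printf ("x=%-8g asin=%.17g via-identity=%.17g\n", x, asin (x), ref);
    }
  return 0;
}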
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
new file mode 100644
index 000000000000..550302826bd9
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
@@ -0,0 +1,242 @@
+/*
+ * Double-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "v_math.h"
+
+const static struct data
+{
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
+} data = {
+
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V2 (0x1p-26),
+#endif
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
+{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
+ return v_call_f64 (asinh, x, y, special);
+}
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static inline float64x2_t
+log_inline (float64x2_t xm, const struct data *d)
+{
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error is 2.79 ULP, in
+ |x| >= 1:
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t ax = vabsq_f64 (x);
+
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
+#endif
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+ overflow, by setting special lanes to 1. These will be fixed later. */
+ float64x2_t option_1 = v_f64 (0);
+ if (likely (v_any_u64 (gt1)))
+ {
+#if WANT_SIMD_EXCEPT
+ float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+ float64x2_t xm = ax;
+#endif
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
+ will be fixed later, either by selecting x or falling back to the scalar
+ special-case. The largest observed error in this region is 1.47 ULPs:
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
+ float64x2_t option_2 = v_f64 (0);
+
+ if (likely (v_any_u64 (vceqzq_u64 (gt1))))
+ {
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
+#if WANT_SIMD_EXCEPT
+ option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+ }
+
+ /* Choose the right option for each lane. */
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+ if (unlikely (v_any_u64 (special)))
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (asinh), 2.29)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+ Ensures the v_sel is choosing the right option in all cases. */
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
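The two options in asinh.c mirror the structure asinh(|x|) = log(|x| + sqrt(x^2 + 1)) for |x| >= 1 and an odd polynomial in |x| otherwise, with the sign re-applied at the end. A scalar sketch of that branch structure (illustrative only; the small-|x| series here is truncated to two correction terms and is far less accurate than the order-17 polynomial in the file):

#include <math.h>
#include <stdio.h>

/* Same branch structure as the vector routine: big-|x| log form, small-|x|
   odd series (truncated for illustration), sign copied back at the end.  */
static double
asinh_sketch (double x)
{
  double ax = fabs (x);
  double y;
  if (ax >= 1.0)
    y = log (ax + sqrt (ax * ax + 1.0)); /* option 1. */
  else
    {
      double a3 = ax * ax * ax;
      y = ax - a3 / 6.0 + 3.0 * a3 * ax * ax / 40.0; /* option 2, low order. */
    }
  return copysign (y, x);
}

int
main (void)
{
  double xs[] = { -4.0, -1.0, -0x1p-4, 0x1p-4, 0.5, 1.0, 100.0 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=% .17g sketch=% .17g\n", xs[i], asinh (xs[i]),
            asinh_sketch (xs[i]));
  return 0;
}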
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
new file mode 100644
index 000000000000..6a96f6ee9f4b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .one = V4 (1),
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+ if (unlikely (v_any_u32 (special)))
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
+#endif
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
+}
+
+HALF_WIDTH_ALIAS_F1 (asinh)
+
+TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (asinh), 2.10)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
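asinhf.c uses asinh(x) = log1p(x + x^2 / (1 + sqrt(x^2 + 1))) for non-negative x, which follows from x + sqrt(x^2 + 1) = 1 + x + x^2 / (1 + sqrt(x^2 + 1)) and keeps the log1p argument well scaled near zero. A scalar check of the rearrangement against libm (illustrative; asinhf_pos_ref is invented here and only handles x >= 0):

#include <math.h>
#include <stdio.h>

static float
asinhf_pos_ref (float x)
{
  /* x + sqrt(x^2 + 1) = 1 + (x + x^2 / (1 + sqrt(x^2 + 1))), so taking logs
     gives the log1p form used by the vector routine (for x >= 0).  */
  float d = 1.0f + sqrtf (fmaf (x, x, 1.0f));
  return log1pf (x + x * x / d);
}

int
main (void)
{
  float xs[] = { 0.0f, 0x1p-10f, 0.5f, 1.0f, 8.0f, 1024.0f };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=%.9g ref=%.9g\n", (double) xs[i],
            (double) asinhf (xs[i]), (double) asinhf_pos_ref (xs[i]));
  return 0;
}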
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
new file mode 100644
index 000000000000..26d264321068
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
@@ -0,0 +1,135 @@
+/*
+ * Double-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+ float64x2_t pi_over_2;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
+#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
+
+/* Fast implementation of vector atan.
+ Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+ z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+ _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t sign = vandq_u64 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
+ v_u64 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float64x2_t az = vbslq_f64 (
+ SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
+ y = vaddq_f64 (y, shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
+ return y;
+}
+
+TEST_SIG (V, D, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (atan), 1.78)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
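The reduction in atan.c maps |x| > 1 to its reciprocal using atan(|x|) = pi/2 - atan(1/|x|), then restores the sign at the end. A scalar check of that reduction against libm atan (illustrative only; atan_reduced is not part of the commit):

#include <math.h>
#include <stdio.h>

static double
atan_reduced (double x)
{
  /* Reduce to an argument in [0, 1]: for |x| > 1 evaluate atan on the
     reciprocal and add the pi/2 shift; the result takes the sign of x.  */
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x);
  double r = ax > 1.0 ? pi_over_2 - atan (1.0 / ax) : atan (ax);
  return copysign (r, x);
}

int
main (void)
{
  double xs[] = { -100.0, -1.5, -0.5, 0.25, 1.0, 3.0, 0x1p40 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; i++)
    printf ("x=%-8g libm=% .17g reduced=% .17g\n", xs[i], atan (xs[i]),
            atan_reduced (xs[i]));
  return 0;
}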
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
new file mode 100644
index 000000000000..18c4b70b92f6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
@@ -0,0 +1,171 @@
+/*
+ * Double-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+ float64x2_t pi_over_2;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+ uint64x2_t zeroinfnan, minustwo;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2),
+ .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3),
+ .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4),
+ .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4),
+ .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
+ .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
+ .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
+ .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6),
+ .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+ .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
+ .c19 = 0x1.358851160a528p-16,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+ .minustwo = V2 (0xc000000000000000),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+ uint64x2_t sign_xy, uint64x2_t cmp)
+{
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+ return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i, const struct data *d)
+{
+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
+}
+
+/* Fast implementation of vector atan2.
+ Maximum observed error is 2.8 ulps:
+ _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+ got 0x1.92d628ab678ccp-1
+ want 0x1.92d628ab678cfp-1. */
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
+
+ uint64x2_t special_cases
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+ uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
+
+ /* Set up z for call to atan. */
+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, q);
+
+ /* Work out the correct shift. */
+ float64x2_t shift
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+ shift = vmulq_f64 (shift, d->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+ ret = vaddq_f64 (ret, shift);
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, sign_xy, special_cases);
+
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+
+ return ret;
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
+TEST_SIG (V, D, 2, atan2)
+// TODO tighten this once __v_atan2 is fixed
+TEST_ULP (V_NAME_D2 (atan2), 2.9)
+TEST_DISABLE_FENV (V_NAME_D2 (atan2))
+TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
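The shift/selection logic in atan2.c reads as: divide the smaller magnitude by the larger (negated when |y| > |x|), add a quadrant-dependent multiple of pi/2, then flip the sign when sign(x) != sign(y) (the XOR with sign_xy). A scalar transcription of that control flow, validated against libm atan2 (illustrative; libm atan stands in for z + z^3*P(z^2) and the sketch ignores the 0/inf/NaN special cases):

#include <math.h>
#include <stdio.h>

static double
atan2_sketch (double y, double x)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  /* z is always in [-1, 1]; negate when swapping numerator/denominator.  */
  double z = aygtax ? -ax / ay : ay / ax;
  /* shift in {0, 1, -2, -1} * pi/2 selects the quadrant.  */
  double shift = (x < 0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0);
  double ret = shift * pi_over_2 + atan (z);
  /* XOR-ing the sign bits of x and y into the result, as the vector code
     does, amounts to a sign flip when x and y have opposite signs.  */
  return signbit (x) != signbit (y) ? -ret : ret;
}

int
main (void)
{
  double pts[][2] = { { 1, 2 },  { 3, 1 },  { 1, -2 }, { 3, -1 },
                      { -1, -2 }, { -3, -1 }, { -1, 2 }, { -3, 2 } };
  for (unsigned i = 0; i < sizeof pts / sizeof pts[0]; i++)
    printf ("atan2(% g,% g): libm=% .17g sketch=% .17g\n", pts[i][0],
            pts[i][1], atan2 (pts[i][0], pts[i][1]),
            atan2_sketch (pts[i][0], pts[i][1]));
  return 0;
}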
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
new file mode 100644
index 000000000000..632014249ab0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
@@ -0,0 +1,127 @@
+/*
+ * Single-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, pi_over_2, c4, c6, c2;
+ float c1, c3, c5, c7;
+ uint32x4_t comp_const;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+ uint32x4_t sign_xy, uint32x4_t cmp)
+{
+ /* Account for the sign of y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
+ return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint32x4_t
+zeroinfnan (uint32x4_t i, const struct data *d)
+{
+ /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+ 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+ _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t iy = vreinterpretq_u32_f32 (y);
+
+ uint32x4_t special_cases
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+ uint32x4_t sign_x = vandq_u32 (ix, SignMask);
+ uint32x4_t sign_y = vandq_u32 (iy, SignMask);
+ uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t pred_xlt0 = vcltzq_f32 (x);
+ uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
+
+ /* Set up z for call to atanf. */
+ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, q);
+
+ /* Work out the correct shift. */
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
+ shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
+ shift = vmulq_f32 (shift, d->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
+
+ /* y = shift + z * P(z^2). */
+ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
+
+ if (unlikely (v_any_u32 (special_cases)))
+ {
+ return special_case (y, x, ret, sign_xy, special_cases);
+ }
+
+ /* Account for the sign of y. */
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
+}
+
+HALF_WIDTH_ALIAS_F2 (atan2)
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
+TEST_SIG (V, F, 2, atan2)
+TEST_DISABLE_FENV (V_NAME_F2 (atan2))
+TEST_ULP (V_NAME_F2 (atan2), 2.46)
+TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
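The quadrant logic in this routine is easier to follow in scalar form. The sketch below is illustrative only and is not part of the patch: libm atanf stands in for the polynomial P, and the zero/inf/nan lanes that the vector code sends to special_case are ignored.

    #include <math.h>

    /* Scalar sketch of the atan2f reduction: pick z = |y|/|x| or -|x|/|y|,
       add a multiple of pi/2 chosen from the octant, then restore the sign
       of the result from sign(x) ^ sign(y).  */
    static float
    atan2f_reduction_sketch (float y, float x)
    {
      const float pi_over_2 = 0x1.921fb6p+0f;
      float ax = fabsf (x), ay = fabsf (y);
      int swap = ay > ax;
      float z = swap ? -ax / ay : ay / ax;
      float shift = (x < 0 ? -2.0f : 0.0f) + (swap ? 1.0f : 0.0f);
      float ret = shift * pi_over_2 + atanf (z); /* atanf stands in for z + z^3 * P(z^2).  */
      float s = copysignf (1.0f, x) * copysignf (1.0f, y);
      return s < 0 ? -ret : ret;
    }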
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
new file mode 100644
index 000000000000..61927c9b261a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
@@ -0,0 +1,109 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x) ~ x + x^3 * P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+#define P(i) d->poly[i]
+
+#define TinyBound 0x30800000 /* asuint(0x1p-30). */
+#define BigBound 0x4e800000 /* asuint(0x1p30). */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanf, x, y, special);
+}
+#endif
+
+/* Fast implementation of vector atanf based on
+ atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+ using z=-1/x and shift = pi/2. Maximum observed error is 2.9 ULP:
+ _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t sign = vandq_u32 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
+ uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
+ v_u32 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
+ /* Avoid a dependency on abs(x) in the division (and comparison). */
+ float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float32x4_t az = vbslq_f32 (
+ SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use a 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z^8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ the threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t y = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
+
+ /* y = shift + z * P(z^2). */
+ y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
+
+ return y;
+}
+
+HALF_WIDTH_ALIAS_F1 (atan)
+
+TEST_SIG (V, F, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (atan), 2.5)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
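For reference, the same reduction in scalar form. This is a sketch only: libm atanf stands in for the polynomial evaluation, and the WANT_SIMD_EXCEPT fallback is omitted.

    #include <math.h>

    /* atan(x) = sign(x) * (shift + atan(z)), with z = |x| and shift = 0 on
       [0, 1], otherwise z = -1/|x| and shift = pi/2.  */
    static float
    atanf_reduction_sketch (float x)
    {
      const float pi_over_2 = 0x1.921fb6p+0f;
      float ax = fabsf (x);
      int red = ax > 1.0f;
      float z = red ? -1.0f / ax : ax;
      float shift = red ? pi_over_2 : 0.0f;
      return copysignf (shift + atanf (z), x); /* atanf stands in for z + z^3 * P(z^2).  */
    }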
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
new file mode 100644
index 000000000000..c2f9585dd29b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
@@ -0,0 +1,75 @@
+/*
+ * Double-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one;
+ uint64x2_t sign_mask;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .sign_mask = V2 (0x8000000000000000) };
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+ uint64x2_t special, const struct data *d)
+{
+ y = log1p_inline (y, &d->log1p_consts);
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+ vmulq_f64 (halfsign, y), special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+ The greatest observed error is 3.31 ULP:
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
+
+ if (unlikely (v_any_u64 (special)))
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special, d);
+#else
+ return special_case (ax, halfsign, y, special, d);
+#endif
+
+ y = log1p_inline (y, &d->log1p_consts);
+ return vmulq_f64 (y, halfsign);
+}
+
+TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)
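The identity used by both the fast path and the special-case path is easier to read in scalar form. The following is a sketch only, with no handling of |x| >= 1:

    #include <math.h>

    /* atanh(x) = 0.5 * log((1 + x) / (1 - x))
                = copysign (0.5, x) * log1p (2*|x| / (1 - |x|)), for |x| < 1.  */
    static double
    atanh_sketch (double x)
    {
      double ax = fabs (x);
      double halfsign = copysign (0.5, x);
      return halfsign * log1p (2.0 * ax / (1.0 - ax));
    }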
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
new file mode 100644
index 000000000000..313d15ca6391
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
@@ -0,0 +1,90 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
+
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
+{
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (unlikely (v_any_u32 (special)))
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
+#endif
+
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
+ if (unlikely (v_any_u32 (special)))
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
+ return vmulq_f32 (halfsign, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (atanh)
+
+TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (atanh), 2.44)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
new file mode 100644
index 000000000000..8e72e5b566fc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
@@ -0,0 +1,127 @@
+/*
+ * Double-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f64.h"
+
+const static struct data
+{
+ float64x2_t poly[4], one_third, shift;
+ int64x2_t exp_bias;
+ uint64x2_t abs_mask, tiny_bound;
+ uint32x4_t thresh;
+ double table[5];
+} data = {
+ .shift = V2 (0x1.8p52),
+ .poly = { /* Generated with fpminimax in [0.5, 1]. */
+ V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+ V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+ .exp_bias = V2 (1022),
+ .abs_mask = V2(0x7fffffffffffffff),
+ .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */
+ .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */
+ .one_third = V2(0x1.5555555555555p-2),
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
+
+#define MantissaMask v_u64 (0x000fffffffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
+{
+ return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations.
+
+ The vector version of frexp does not handle subnormals
+ correctly. As a result these need to be handled by the scalar
+ fallback, where accuracy may be worse than that of the vector code
+ path.
+
+ Greatest observed error in the normal range is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+ int64x2_t exp_bias = d->exp_bias;
+ uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+ int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
+ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+ float64x2_t one_third = d->one_third;
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+ float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+ float64x2_t a
+ = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+ a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ float64x2_t ef = vcvtq_f64_s64 (e);
+ float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+ int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+ int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+ float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+ my = vmulq_f64 (my, a);
+
+ /* Vector version of ldexp. */
+ float64x2_t y = vreinterpretq_f64_s64 (
+ vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+ y = vmulq_f64 (y, my);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+/* Worst-case ULP error assumes that the scalar fallback is GLIBC 2.40 cbrt, which
+ has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+ in the vector path is 1.79 ULP.
+ [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+ Functions in Single, Double, Double Extended, and Quadruple Precision. */
+TEST_ULP (V_NAME_D1 (cbrt), 3.17)
+TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
+TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
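The reconstruction step described in the comment can be sketched in scalar code as follows. This is illustrative only: libm frexp and cbrt replace the in-register frexp and the polynomial-plus-Newton estimate, and special cases are ignored.

    #include <math.h>

    static double
    cbrt_reconstruct_sketch (double x)
    {
      /* T[i] = 2^((i - 2) / 3), as in the table above.  */
      static const double T[5] = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1,
                                   0x1p0, 0x1.428a2f98d728bp0,
                                   0x1.965fea53d6e3dp0 };
      int e;
      double m = frexp (fabs (x), &e);  /* |x| = m * 2^e, m in [0.5, 1).  */
      double a = cbrt (m);              /* Stands in for the poly + Newton steps.  */
      int ey = (int) round (e / 3.0);   /* round(e / 3).  */
      int rem = e - 3 * ey;             /* Remainder in [-2, 2].  */
      return copysign (ldexp (a * T[rem + 2], ey), x);
    }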
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
new file mode 100644
index 000000000000..4e76feb2dd8b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
@@ -0,0 +1,117 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[4], one_third;
+ float table[5];
+} data = {
+ .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+ FPMinimax. */
+ V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+ V4 (0x1.2c74c2p-3) },
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+ return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+ return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+ table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+ int32x4_t e
+ = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+ float32x4_t one_third = d->one_third;
+ float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+ float32x4_t a
+ = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+ int32x4_t ey = vcvtq_s32_f32 (ef);
+ int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+ float32x4_t my = shifted_lookup (d->table, em3);
+ my = vmulq_f32 (my, a);
+
+ /* Vector version of ldexpf. */
+ float32x4_t y
+ = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+ y = vmulq_f32 (y, my);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+ /* Copy sign. */
+ return vbslq_f32 (SignMask, x, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (cbrt)
+
+TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cbrt), 1.15)
+TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
new file mode 100644
index 000000000000..40ba5ff31f20
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
@@ -0,0 +1,47 @@
+/*
+ * Double-precision vector sincos function - return-by-value interface.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincos_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+static float64x2x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
+{
+ return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special),
+ v_call_f64 (cos, x, y.val[1], special) };
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR float64x2x2_t
+_ZGVnN2v_cexpi (float64x2_t x)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
+TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
+TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
+#define V_CEXPI_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
+V_CEXPI_INTERVAL (0, 0x1p23, 500000)
+V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
+V_CEXPI_INTERVAL (0x1p23, inf, 10000)
+V_CEXPI_INTERVAL (-0x1p23, -inf, 10000)
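A minimal usage sketch of the return-by-value interface. The prototype is assumed to come from the library's own headers (in the library it carries the vector PCS attribute); it is declared inline here purely for illustration.

    #include <arm_neon.h>

    float64x2x2_t _ZGVnN2v_cexpi (float64x2_t);

    /* Compute sin and cos of two angles with one shared argument reduction.  */
    static void
    cexpi_usage_sketch (const double theta[2], double s[2], double c[2])
    {
      float64x2x2_t sc = _ZGVnN2v_cexpi (vld1q_f64 (theta));
      vst1q_f64 (s, sc.val[0]); /* sin lanes.  */
      vst1q_f64 (c, sc.val[1]); /* cos lanes.  */
    }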
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
new file mode 100644
index 000000000000..e55d99653a66
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
@@ -0,0 +1,49 @@
+/*
+ * Single-precision vector cexpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+static float32x4x2_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
+{
+ return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special),
+ v_call_f32 (cosf, x, y.val[1], special) };
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR float32x4x2_t
+_ZGVnN4v_cexpif (float32x4_t x)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
+TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
+TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
+#define V_CEXPIF_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
+V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
+V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
+V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
+V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
new file mode 100644
index 000000000000..9f3de4dd5c36
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
@@ -0,0 +1,92 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+ n = vsubq_f64 (n, v_f64 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+TEST_SIG (V, D, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (cos), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)
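The range reduction above can be written in scalar terms as below. This is a sketch only, valid for |x| below the 2^23 bound: a single double-precision pi replaces the three-part pi_1/pi_2/pi_3 split, and libm sin stands in for the polynomial.

    #include <math.h>

    static double
    cos_reduction_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      const double inv_pi = 0x1.45f306dc9c883p-2;
      double ax = fabs (x);
      double n0 = rint (fma (ax, inv_pi, 0.5)); /* Quotient; its parity selects the sign.  */
      double r = ax - (n0 - 0.5) * pi;          /* Reduced to [-pi/2, pi/2].  */
      double s = sin (r);                       /* Stands in for the sin polynomial.  */
      return ((long long) n0 & 1) ? -s : s;     /* cos(x) = (-1)^n0 * sin(r).  */
    }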
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
new file mode 100644
index 000000000000..d2844e44e196
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (cos)
+
+TEST_SIG (V, F, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (cos), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
new file mode 100644
index 000000000000..54407b23aa9d
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
@@ -0,0 +1,107 @@
+/*
+ * Double-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2;
+ double ln2[2];
+ float64x2_t shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t ln2 = vld1q_f64 (d->ln2);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2, 1);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+ /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so it is the
+ same as that of the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
+}
+
+TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (cosh), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
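In scalar terms the non-special path computes the following. This is a sketch, with libm exp in place of exp_inline and no overflow handling.

    #include <math.h>

    /* cosh(x) = (exp(x) + exp(-x)) / 2 = exp(|x|)/2 + 0.5/exp(|x|).  */
    static double
    cosh_sketch (double x)
    {
      double t = exp (fabs (x));
      return 0.5 * t + 0.5 / t;
    }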
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
new file mode 100644
index 000000000000..f1ed3e5161fd
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
+ .special_bound = V4 (0x42ad496c),
+#endif
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.38 ULP:
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
+ variant for all inputs if any input is a special value or above the bound
+ at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+ if (unlikely (v_any_u32 (special)))
+ return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+ /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+ input to 0, which will generate no exceptions. */
+ if (unlikely (v_any_u32 (tiny)))
+ ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
+#endif
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (tiny)))
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, half_t, half_over_t, special);
+#endif
+
+ return vaddq_f32 (half_t, half_over_t);
+}
+
+HALF_WIDTH_ALIAS_F1 (cosh)
+
+TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cosh), 1.89)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
new file mode 100644
index 000000000000..e63201a55786
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+ float64x2_t range_val;
+} data = {
+ /* Polynomial coefficients generated using the Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (arm_math_cospi, x, y, cmp);
+}
+
+/* Approximation for vector double-precision cospi(x).
+ Maximum Error 3.06 ULP:
+ _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1
+ want 0x1.fa854babfb6c1p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float64x2_t r = vabsq_f64 (x);
+ uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+ to prevent them from overflowing and throwing exceptions. */
+ r = v_zerofy_f64 (r, cmp);
+ uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
+
+#else
+ float64x2_t r = x;
+ uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+#endif
+
+ r = vsubq_f64 (r, vrndaq_f64 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
+
+ /* y = sin(r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (cospi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#endif
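The reduction used by both cospi variants can be written in scalar form as follows. This is a sketch only, for |x| small enough that the integer cast is exact; libm sin stands in for the sinpi polynomial.

    #include <math.h>

    /* With k = round(x) and f = x - k in [-1/2, 1/2]:
       cospi(x) = (-1)^k * sinpi(1/2 - |f|).  */
    static double
    cospi_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      double k = round (x);             /* Ties away, matching vrnda/vcvta.  */
      double r = 0.5 - fabs (x - k);
      double s = sin (r * pi);          /* Stands in for the sinpi polynomial.  */
      return ((long long) k & 1) ? -s : s;
    }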
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
new file mode 100644
index 000000000000..62f4b8122b2c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
@@ -0,0 +1,86 @@
+/*
+ * Single-precision vector cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+ float32x4_t range_val;
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (arm_math_cospif, x, y, cmp);
+}
+
+/* Approximation for vector single-precision cospi(x).
+ Maximum Error: 3.17 ULP:
+ _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float32x4_t r = vabsq_f32 (x);
+ uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+ to prevent them from overflowing and throwing exceptions. */
+ r = v_zerofy_f32 (r, cmp);
+ uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
+
+#else
+ float32x4_t r = x;
+ uint32x4_t cmp = vcageq_f32 (r, d->range_val);
+
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+#endif
+
+ /* r = x - rint(x). */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (cospi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (cospi), 2.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
new file mode 100644
index 000000000000..40717a660ce2
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
@@ -0,0 +1,166 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t third;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
+ float64x2_t max, shift;
+ uint64x2_t max_idx;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound, huge_bound, scale_minus_one;
+#endif
+} data = {
+ .max_idx = V2 (768),
+ .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
+ .two_over_fifteen = 0x1.1111111111111p-3,
+ .tenth = V2 (-0x1.999999999999ap-4),
+ .two_over_five = V2 (-0x1.999999999999ap-2),
+ .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+ .max = V2 (5.9921875), /* 6 - 1/128. */
+ .shift = V2 (0x1p45),
+#if WANT_SIMD_EXCEPT
+ .huge_bound = V2 (0x1p205),
+ .tiny_bound = V2 (0x1p-226),
+ .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffffffffffff
+
+struct entry
+{
+ float64x2_t erf;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
+ e.erf = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+/* Double-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3)) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+ ]
+
+ Maximum measured error: 2.29 ULP
+ V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+ want -0x1.20dd59132ebafp-8. */
+float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ float64x2_t a = vabsq_f64 (x);
+ /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
+ to return expected results. */
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| huge or tiny. */
+ uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound);
+ uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound);
+ uint64x2_t cmp = vorrq_u64 (cmp1, cmp2);
+ /* If any lanes are special, mask them with 1 for small x or 8 for large
+ values and retain a copy of a to allow the special-case handler to fix special
+ lanes later. This is only necessary if fenv exceptions are to be triggered
+ correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ {
+ a = vbslq_f64 (cmp1, v_f64 (8.0), a);
+ a = vbslq_f64 (cmp2, v_f64 (1.0), a);
+ }
+#endif
+
+ /* Set r to multiple of 1/128 nearest to |x|. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Lookup erf(r) and scale(r) in table, without shortcut for small values,
+ but with saturated indices for large values and NaNs in order to avoid
+ segfault. */
+ uint64x2_t i
+ = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
+ struct entry e = lookup (i);
+
+ float64x2_t r = vsubq_f64 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
+ float64x2_t p1 = r;
+ float64x2_t p2
+ = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
+ p4 = vfmsq_f64 (dat->tenth, r2, p4);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
+
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p34, d2, p5);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Solves the |x| = inf and NaN cases. */
+ y = vbslq_f64 (a_gt_max, v_f64 (1.0), y);
+
+ /* Copy sign. */
+ y = vbslq_f64 (v_u64 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp2)))
+ {
+ /* Neutralise huge values of x before fixing small values. */
+ x = vbslq_f64 (cmp1, v_f64 (1.0), x);
+ /* Fix tiny values that trigger spurious underflow. */
+ return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y);
+ }
+#endif
+ return y;
+}
+
+TEST_SIG (V, D, 1, erf, -6.0, 6.0)
+TEST_ULP (V_NAME_D1 (erf), 1.79)
+/* WANT_SIMD_EXCEPT blocks miss some cases. */
+TEST_DISABLE_FENV (V_NAME_D1 (erf))
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
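The series expansion in the comment, truncated to its first three terms, looks like this in scalar form. This is a sketch: the table lookup is replaced by direct evaluation of erf(r) and scale(r), and index clamping and special cases are omitted.

    #include <math.h>

    static double
    erf_expansion_sketch (double x)
    {
      const double pi = 0x1.921fb54442d18p+1;
      double a = fabs (x);
      double r = round (a * 128.0) / 128.0;           /* Nearest multiple of 1/128.  */
      double d = a - r;
      double scale = 2.0 / sqrt (pi) * exp (-r * r);  /* Read from the table above.  */
      double erf_r = erf (r);                         /* Read from the same table.  */
      double poly = d - r * d * d + (2.0 * r * r - 1.0) / 3.0 * d * d * d;
      return copysign (erf_r + scale * poly, x);
    }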
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
new file mode 100644
index 000000000000..97ef09ecc113
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
@@ -0,0 +1,205 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t offset, table_scale;
+ float64x2_t max, shift;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
+#if WANT_SIMD_EXCEPT
+ float64x2_t uflow_bound;
+#endif
+} data = {
+ /* Set an offset so the range of the index used for lookup is 3487, and it
+ can be clamped using a saturated add on an offset index.
+ Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
+ .offset = V2 (0xbd3ffffffffff260),
+ .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */
+ .max = V2 (0x1.b3ep+4), /* 3487/128. */
+ .shift = V2 (0x1p45),
+ .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
+ .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
+ .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
+ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
+ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
+ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
+ .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
+ .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
+ .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
+ .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
+#endif
+};
+
+#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */
+#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
+
+struct entry
+{
+ float64x2_t erfc;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ float64x2_t e2
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
+ e.erfc = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ return v_call_f64 (erfc, x, y, cmp);
+}
+#endif
+
+/* Optimized double-precision vector erfc(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+ - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ + p6(r) d^6 + ... + p10(r) d^10
+
+ Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+ 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+ with p0 = 1, and p1(r) = -r.
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Maximum measured error: 1.71 ULP
+ V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+ want 0x1.e15fcbea3e7adp-608. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
+ /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
+ vreinterpretq_s64_f64 (dat->uflow_bound));
+ cmp = vorrq_u64 (cmp, uflow);
+ float64x2_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ the special-case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ x = v_zerofy_f64 (x, cmp);
+#endif
+
+ float64x2_t a = vabsq_f64 (x);
+ a = vminq_f64 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Clamp index to a range of 3487. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float64x2_t r = vsubq_f64 (z, shift);
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ float64x2_t p1 = r;
+ float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
+ p4 = vfmsq_f64 (dat->p40, r2, p4);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
+ /* Compute p_i using recurrence relation:
+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
+ float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+ qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+ qr9 = vld1q_f64 (dat->qr9);
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, qr9, 1);
+ /* Compute polynomial in d using pairwise Horner scheme. */
+ float64x2_t p90 = vfmaq_f64 (p9, d, p10);
+ float64x2_t p78 = vfmaq_f64 (p7, d, p8);
+ float64x2_t p56 = vfmaq_f64 (p5, d, p6);
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p78, d2, p90);
+ y = vfmaq_f64 (p56, d2, y);
+ y = vfmaq_f64 (p34, d2, y);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Offset equals 2.0 if sign, else 0.0. */
+ uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63);
+ float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, logical or and addition are equivalent here. */
+ float64x2_t fac = vreinterpretq_f64_u64 (
+ vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f64 (off, fac, y);
+}
+
+TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (V_NAME_D1 (erfc), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
new file mode 100644
index 000000000000..f420439ef8a3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
@@ -0,0 +1,174 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint32x4_t offset, table_scale;
+ float32x4_t max, shift;
+ float coeffs[4];
+ float32x4_t third, two_over_five, tenth;
+#if WANT_SIMD_EXCEPT
+ float32x4_t uflow_bound;
+#endif
+
+} data = {
+ /* Set an offset so the range of the index used for lookup is 644, and it can
+ be clamped using a saturated add. */
+ .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */
+ .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */
+ .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */
+ .shift = V4 (0x1p17f),
+ /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
+ fmas. */
+ .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .third = V4 (0x1.555556p-2f),
+ .two_over_five = V4 (-0x1.99999ap-2f),
+ .tenth = V4 (-0x1.99999ap-4f),
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V4 (0x1.2639cp+3f),
+#endif
+};
+
+#define TinyBound 0x41000000 /* 0x1p-62f << 1. */
+#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */
+#define Off 0xfffffd7b /* 0xffffffff - 644. */
+
+struct entry
+{
+ float32x4_t erfc;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float32x2_t t0
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ float32x2_t t1
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ float32x2_t t2
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ float32x2_t t3
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
+ e.erfc = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ return v_call_f32 (erfcf, x, y, cmp);
+}
+#endif
+
+/* Optimized single-precision vector erfcf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/64.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+ _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+ want 0x1.f51216p-120. */
+NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
+ /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
+ vreinterpretq_s32_f32 (dat->uflow_bound));
+ cmp = vorrq_u32 (cmp, uflow);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ a = vminq_f32 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ /* Clamp index to a range of 644. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float32x4_t r = vsubq_f32 (z, shift);
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t p1 = r;
+ float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
+ float32x4_t p3
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
+ p4 = vfmsq_f32 (dat->tenth, r2, p4);
+
+ float32x4_t y = vfmaq_f32 (p3, d, p4);
+ y = vfmaq_f32 (p2, d, y);
+ y = vfmaq_f32 (p1, d, y);
+ y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Offset equals 2.0f if sign, else 0.0f. */
+ uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31);
+ float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, logical OR and addition are equivalent here. */
+ float32x4_t fac = vreinterpretq_f32_u32 (
+ vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f32 (off, fac, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (erfc)
+
+TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_F1 (erfc), 1.14)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
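The expansion documented in the erfcf comment block can be cross-checked with a small scalar sketch. This is illustrative only: it leans on libm's erfc/exp rather than the routine's tables, and x is an arbitrary example value.

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 1.23;
  double r = round (x * 64) / 64;          /* nearest multiple of 1/64.  */
  double d = x - r;
  double scale = 2.0 / sqrt (3.141592653589793) * exp (-r * r);
  /* poly(r, d) exactly as written in the comment above.  */
  double poly = 1 - r * d + (2.0 / 3 * r * r - 1.0 / 3) * d * d
                - r * (1.0 / 3 * r * r - 0.5) * d * d * d
                + (2.0 / 15 * r * r * r * r - 2.0 / 5 * r * r + 0.1)
                      * d * d * d * d;
  printf ("%.9g %.9g\n", erfc (r) - scale * d * poly, erfc (x));
  return 0;
}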
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
new file mode 100644
index 000000000000..508bc4c2f5e2
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
@@ -0,0 +1,120 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t max, shift, third;
+#if WANT_SIMD_EXCEPT
+ float32x4_t tiny_bound, scale_minus_one;
+#endif
+} data = {
+ .max = V4 (3.9375), /* 4 - 8/128. */
+ .shift = V4 (0x1p16f),
+ .third = V4 (0x1.555556p-2f), /* 1/3. */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x1p-62f),
+ .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffff
+
+struct entry
+{
+ float32x4_t erf;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
+ e.erf = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+/* Single-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+ Values of erf(r) and scale are read from lookup tables.
+ For |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+ Maximum error: 1.93 ULP
+ _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
+ want 0x1.fd6868p-9. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. */
+ uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
+
+ /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ uint32x4_t i
+ = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift));
+ i = vminq_u32 (i, v_u32 (512));
+ struct entry e = lookup (i);
+
+ float32x4_t r = vsubq_f32 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t y = vfmaq_f32 (r, dat->third, d);
+ y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Handle |x| > 3.9375 (including inf), where erf(|x|) rounds to 1.0f. */
+ y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
+
+ /* Copy sign. */
+ y = vbslq_f32 (v_u32 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
+#endif
+ return y;
+}
+
+HALF_WIDTH_ALIAS_F1 (erf)
+
+TEST_SIG (V, F, 1, erf, -4.0, 4.0)
+TEST_ULP (V_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
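Likewise, the short expansion used by erff can be checked against libm in a few lines (illustrative only; x is an arbitrary value and libm's erf/exp stand in for the table):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 0.7654;
  double r = round (x * 128) / 128;        /* nearest multiple of 1/128.  */
  double d = x - r;
  double scale = 2.0 / sqrt (3.141592653589793) * exp (-r * r);
  printf ("%.9g %.9g\n", erf (r) + scale * d * (1.0 - r * d - d * d / 3.0),
          erf (x));
  return 0;
}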
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
new file mode 100644
index 000000000000..a928c35c9418
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
@@ -0,0 +1,134 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+ V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+ .special_bound = V2 (704.0),
+#endif
+ .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+ .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+ float64x2_t n, r, r2, s, y, z;
+ uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+ /* n = round(x/(ln2/N)). */
+ z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+ u = vreinterpretq_u64_f64 (z);
+ n = vsubq_f64 (z, data.shift);
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (r, data.ln2_lo, n);
+
+ e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (C (0), C (1), r);
+ y = vfmaq_f64 (y, C (2), r2);
+ y = vfmaq_f64 (r, y, r2);
+
+ /* s = 2^(n/N). */
+ u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+ s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
+
+TEST_SIG (V, D, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp), 1.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)
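The rounding idiom z = x*InvLn2N + Shift; n = z - Shift relies on 0x1.8p52 forcing the addition to round at integer granularity while leaving that integer in the low mantissa bits. A standalone scalar sketch of the trick, using the N/ln2 constant from the table above (valid in the default rounding mode and for |x*N/ln2| well below 2^51):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const double Shift = 0x1.8p52;
  const double InvLn2N = 0x1.71547652b82fep7; /* N/ln2, from the table.  */

  double x = 3.7;
  double z = x * InvLn2N + Shift;
  uint64_t u;
  memcpy (&u, &z, sizeof u);            /* low bits now hold the integer.  */
  double n = z - Shift;                 /* n = round (x*N/ln2) as a double.  */

  printf ("n = %g, low bits = %llu, round () = %g\n", n,
          (unsigned long long) (u & 0xfffff), round (x * InvLn2N));
  return 0;
}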
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
new file mode 100644
index 000000000000..24fdd1c7d257
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
@@ -0,0 +1,147 @@
+/*
+ * Double-precision vector 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 163840.0 /* 1280.0 * N. */
+
+const static struct data
+{
+ float64x2_t poly[4];
+ float64x2_t log10_2, log2_10_hi, log2_10_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm.
+ rel error: 0x1.5ddf8f28p-54
+ abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ]
+ maxerr: 1.14432 +0.5 ulp. */
+ .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1),
+ V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) },
+ .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log10(2). */
+ .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log10(2)/N. */
+ .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
+ .shift = V2 (0x1.8p+52),
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (ScaleBound),
+ .special_bound = V2 (SpecialBound),
+#endif
+};
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask v_u64 (N - 1)
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
+# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp10, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp10.
+ Maximum measured error is 1.64 ulp.
+ _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
+ want 0x1.f8dab6d7fed0ap+5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcageq_f64 (x, d->special_bound);
+#endif
+
+ /* n = round(x/(log10(2)/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*log10(2)/N. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, d->log2_10_hi, n);
+ r = vfmsq_f64 (r, d->log2_10_lo, n);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, IndexMask);
+
+ /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]);
+ float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]);
+ p = vfmaq_f64 (p, y, r2);
+ y = vmulq_f64 (r, p);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n, d);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp10), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#endif
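The special-case path splits 2^(n/N) into s1*s2 by adding and subtracting exponent-sized bit patterns (SpecialBias1/SpecialBias2). The underlying trick, that adding k << 52 to the bit pattern of a normal double multiplies it by 2^k, can be seen in a scalar sketch (illustrative only, and only valid while the result stays finite and normal):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
scale_by_pow2_bits (double s, int k)
{
  uint64_t u;
  memcpy (&u, &s, sizeof u);
  u += (uint64_t) (int64_t) k << 52;   /* adjust the biased exponent.  */
  memcpy (&s, &u, sizeof u);
  return s;
}

int
main (void)
{
  printf ("%a %a\n", scale_by_pow2_bits (1.75, 100), ldexp (1.75, 100));
  printf ("%a %a\n", scale_by_pow2_bits (1.75, -200), ldexp (1.75, -200));
  return 0;
}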
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
new file mode 100644
index 000000000000..eb0d5dd0d57c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
@@ -0,0 +1,147 @@
+/*
+ * Single-precision vector 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error.
+ rel error: 0x1.89dafa3p-24
+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+ maxerr: 1.85943 +0.5 ulp. */
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V4 (ScaleBound)
+#endif
+};
+
+#if WANT_SIMD_EXCEPT
+
+# define SpecialBound 38.0f /* rint(log10(2^127)). */
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */
+# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp10f, x, y, cmp);
+}
+
+#else
+
+# define SpecialBound 126.0f
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+/* Fast vector implementation of single-precision exp10.
+ Algorithm is accurate to 2.36 ULP.
+ _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
+ want 0x1.7e79cp+11. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+#if WANT_SIMD_EXCEPT
+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t cmp = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
+
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp10)
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp10), 1.86)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+#endif
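The reduction in the comment above, exp10(x) = 2^n * 10^r with n = round(x/log10(2)), is an exact identity. A scalar sketch, with libm functions standing in for the polynomial and the exponent bit manipulation:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 3.3;
  double log10_2 = 0.30102999566398119521;
  double n = round (x / log10_2);
  double r = x - n * log10_2;
  /* 10^x = 2^n * 10^r, since 10^(n*log10(2)) = 2^n.  */
  printf ("%.17g %.17g\n", pow (10.0, x), ldexp (pow (10.0, r), (int) n));
  return 0;
}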
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
new file mode 100644
index 000000000000..63448d806b82
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
@@ -0,0 +1,128 @@
+/*
+ * Double-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define BigBound 1022.0
+#define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
+
+static const struct data
+{
+ float64x2_t poly[4];
+ float64x2_t shift, scale_big_bound, scale_uoflow_bound;
+} data = {
+ /* Coefficients are computed using Remez algorithm with
+ minimisation of the absolute error. */
+ .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3),
+ V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) },
+ .shift = V2 (0x1.8p52 / N),
+ .scale_big_bound = V2 (BigBound),
+ .scale_uoflow_bound = V2 (UOFlowBound),
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+ __v_exp_data[i[1] & IndexMask] };
+}
+
+#if WANT_SIMD_EXCEPT
+
+# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
+
+/* Call scalar exp2 as a fallback. */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
+{
+ return v_call_f64 (exp2, x, y, is_special);
+}
+
+#else
+
+# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp2.
+ Maximum measured error is 1.65 ulp.
+ _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+ want 0x1.f8db0d4df721dp-1. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
+ /* Mask special lanes and retain a copy of x for passing to special-case
+ handler. */
+ float64x2_t xc = x;
+ x = v_zerofy_f64 (x, cmp);
+#else
+ cmp = vcagtq_f64 (x, d->scale_big_bound);
+#endif
+
+ /* n = round(x*N)/N, i.e. x rounded to the nearest multiple of 1/N. */
+ float64x2_t z = vaddq_f64 (d->shift, x);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n. */
+ float64x2_t r = vsubq_f64 (x, n);
+
+ /* s = 2^(n/N). */
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ u = lookup_sbits (u);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ /* y ~ exp2(r) - 1. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
+ y = vmulq_f64 (r, y);
+
+ if (unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
+ return special_case (s, y, n, d);
+#else
+ return special_case (xc, vfmaq_f64 (s, s, y), cmp);
+#endif
+ return vfmaq_f64 (s, s, y);
+}
+
+TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp2), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
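exp2 reduces x to a multiple of 1/N plus a small remainder, then reassembles 2^x from a table entry and an exponent adjustment. A scalar sketch of that decomposition with N = 128 (the table size implied by ScaleBound = 1280.0 * N), using libm's exp2/ldexp in place of __v_exp_data:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const int N = 128;                        /* 1 << V_EXP_TABLE_BITS.  */
  double x = 5.4321;
  long k = lround (x * N);
  double r = x - (double) k / N;            /* |r| <= 1/(2N).  */
  double tab = exp2 ((double) (k % N) / N); /* stands in for __v_exp_data.  */
  double s = ldexp (tab, (int) (k / N));    /* 2^(k/N).  */
  printf ("%.17g %.17g\n", exp2 (x), s * exp2 (r));
  return 0;
}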
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
new file mode 100644
index 000000000000..40f6170d3702
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
@@ -0,0 +1,122 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t scale_thresh, special_bound;
+#endif
+ float c0, c2, c4, zero;
+} data = {
+ /* maxerr: 1.962 ulp. */
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp2)
+
+TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp2), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
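The scale computation above relies on (n << 23) + 0x3f800000 being the bit pattern of 2^n for integer n in the normal range. A scalar sketch of that exponent_bias trick (valid for -126 <= n <= 127):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
pow2_bits (int32_t n)
{
  uint32_t u = ((uint32_t) n << 23) + 0x3f800000u; /* add the biased exponent. */
  float f;
  memcpy (&f, &u, sizeof f);
  return f;
}

int
main (void)
{
  for (int n = -5; n <= 5; n++)
    printf ("%d: %a %a\n", n, pow2_bits (n), ldexpf (1.0f, n));
  return 0;
}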
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
new file mode 100644
index 000000000000..1f8e89ab658f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
@@ -0,0 +1,73 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c1, c2, c3, c4, c5, shift;
+ uint32x4_t exponent_bias;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+} data = {
+ .shift = V4 (0x1.8p23f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.878 ulp. */
+ .c0 = V4 (0x1.416b5ep-13f),
+ .c1 = V4 (0x1.5f082ep-10f),
+ .c2 = V4 (0x1.3b2dep-7f),
+ .c3 = V4 (0x1.c6af7cp-5f),
+ .c4 = V4 (0x1.ebfbdcp-3f),
+ .c5 = V4 (0x1.62e43p-1f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+ /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = x - n;
+ uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+ float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (d->c5, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
+TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
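The polynomial in _ZGVnN4v_exp2f_1u is a plain Horner chain whose coefficients approximate 2^r on [-1/2, 1/2]. A scalar sketch using the coefficients from the data table above:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const float c[] = { 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f,
                      0x1.c6af7cp-5f,  0x1.ebfbdcp-3f,  0x1.62e43p-1f };
  float r = 0.25f;
  float p = c[0];
  for (int i = 1; i < 6; i++)
    p = p * r + c[i];
  p = p * r + 1.0f;                /* poly(r) ~= 2^r on [-1/2, 1/2].  */
  printf ("%a %a\n", p, exp2f (r));
  return 0;
}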
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
new file mode 100644
index 000000000000..e5b1f020d1a0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
@@ -0,0 +1,130 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ /* (s2 + p*s2)*s1 = s2*(p+1)*s1. */
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t cmp = vcgeq_u32 (
+ vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
+ TinyBound),
+ SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp)
+
+TEST_SIG (V, F, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
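The reduction r = x - n*ln2_hi - n*ln2_lo is a classic two-constant (Cody-Waite) split: ln2_hi carries only a few significand bits, so n*ln2_hi is exact, and ln2_lo mops up the remainder of ln2. A scalar sketch with the constants from the table above (the vector code uses fused multiply-subtracts; plain multiplies are enough to show the idea):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;    /* few bits: n*ln2_hi is exact.  */
  const float ln2_lo = 0x1.7f7d1cp-20f; /* remainder of ln2.  */

  float x = 20.0f;
  float n = roundf (x * inv_ln2);       /* round (x/ln2).  */
  float r = x - n * ln2_hi;
  r = r - n * ln2_lo;
  printf ("r = %a (reference %a)\n", r, (float) (x - (double) n * log (2.0)));
  return 0;
}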
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
new file mode 100644
index 000000000000..4e114d810e08
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
@@ -0,0 +1,79 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t shift, inv_ln2;
+ uint32x4_t exponent_bias;
+ float32x4_t c1, c2, c3, c4;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+ float ln2_hi, ln2_lo, c0, nothing;
+} data = {
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .shift = V4 (0x1.8p23f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x83000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.36565 +0.5 ulp. */
+ .c0 = 0x1.6a6000p-10f,
+ .c1 = V4 (0x1.12718ep-7f),
+ .c2 = V4 (0x1.555af0p-5f),
+ .c3 = V4 (0x1.555430p-3f),
+ .c4 = V4 (0x1.fffff4p-2f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t z = vmulq_f32 (x, d->inv_ln2);
+ float32x4_t n = vrndaq_f32 (z);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
+TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
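specialcase above selects between r1 and r0 with bitwise masks rather than a branch. A scalar sketch of that mask-select idiom, (mask & a) | (~mask & b), which is what the vector code does per lane with all-ones or all-zeros comparison results:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t a = 0x11111111, b = 0x22222222;
  uint32_t all = 0xffffffffu, none = 0;
  /* Prints a for an all-ones mask and b for an all-zeros mask.  */
  printf ("%x %x\n", (all & a) | (~all & b), (none & a) | (~none & b));
  return 0;
}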
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
new file mode 100644
index 000000000000..7535a1830427
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
@@ -0,0 +1,77 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t thresh, tiny_bound;
+#else
+ float64x2_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+#if WANT_SIMD_EXCEPT
+ /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+ compare. */
+ .thresh = V2 (0x78c56fa6d34b552),
+ /* asuint64(0x1p-51) << 1. */
+ .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+ /* Value above which expm1(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+ want 0x1.a8897eef87b32p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
+#else
+ /* Large input, NaNs and Infs. */
+ uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (expm1), 1.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
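The WANT_SIMD_EXCEPT path uses a shifted absolute compare: doubling the bit pattern discards the sign, so a single unsigned subtract-and-compare tests whether |x| lies outside [2^-51, oflow_bound), catching NaN and Inf as well. A scalar sketch using the thresh constant from the table above:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
is_special (double x)
{
  const uint64_t tiny = 0x3cc0000000000000ull << 1; /* asuint64(0x1p-51)<<1. */
  const uint64_t thresh = 0x78c56fa6d34b552ull;     /* from the table above. */
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  return (ix + ix) - tiny >= thresh;  /* unsigned wraparound does the work.  */
}

int
main (void)
{
  printf ("%d %d %d\n", is_special (0x1p-60), is_special (1.0),
          is_special (1e300));        /* expect 1 0 1.  */
  return 0;
}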
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
new file mode 100644
index 000000000000..6d4431dcd8a5
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data d;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1F_DATA,
+#if !WANT_SIMD_EXCEPT
+ /* Value above which expm1f(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+ /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+ compare. */
+ .thresh = V4 (0x1d5ebc40),
+#endif
+};
+
+/* asuint(0x1p-23), shifted by 1 for abs compare. */
+#define TinyBound v_u32 (0x34000000 << 1)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+}
+
+/* Single-precision vector exp(x) - 1 function.
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
+#else
+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1f_inline (x, &d->d);
+}
+
+HALF_WIDTH_ALIAS_F1 (expm1)
+
+TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (expm1), 1.13)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
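Both expm1 variants end with the comment expm1(x) ~= p * t + (t - 1). Assuming, as in the usual reduction, t = 2^k and p = expm1(r) with x = k*ln2 + r (the shared inline helpers are not part of this diff, so that reading is an assumption), the identity can be checked directly:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 3.1;
  double k = round (x / log (2.0));
  double r = x - k * log (2.0);
  double t = ldexp (1.0, (int) k);   /* t = 2^k (assumed meaning of t).  */
  double p = expm1 (r);              /* p = expm1(r) (assumed meaning of p). */
  printf ("%.17g %.17g\n", expm1 (x), p * t + (t - 1.0));
  return 0;
}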
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
new file mode 100644
index 000000000000..0c8350a1a77b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
@@ -0,0 +1,361 @@
+/*
+ * Double-precision x^y function.
+ *
+ * Copyright (c) 2018-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Scalar version of pow used for fallbacks in vector implementations. */
+
+/* Data is defined in v_pow_log_data.c. */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define Off 0x3fe6955500000000
+#define As __v_pow_log_data.poly
+
+/* Data is defined in v_pow_exp_data.c. */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define SmallExp 0x3c9 /* top12(0x1p-54). */
+#define BigExp 0x408 /* top12(512.0). */
+#define ThresExp 0x03f /* BigExp - SmallExp. */
+#define InvLn2N __v_pow_exp_data.n_over_ln2
+#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi
+#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo
+#define SBits __v_pow_exp_data.sbits
+#define Cs __v_pow_exp_data.poly
+
+/* Constants associated with pow. */
+#define SmallPowX 0x001 /* top12(0x1p-126). */
+#define BigPowX 0x7ff /* top12(INFINITY). */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
+#define BigPowY 0x43e /* top12(0x1.749p62). */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+
+/* Top 12 bits of a double (sign and exponent bits). */
+static inline uint32_t
+top12 (double x)
+{
+ return asuint64 (x) >> 52;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline double
+log_inline (uint64_t ix, double *tail)
+{
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64_t tmp = ix - Off;
+ int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+ int k = (int64_t) tmp >> 52; /* arithmetic shift. */
+ uint64_t iz = ix - (tmp & 0xfffULL << 52);
+ double z = asdouble (iz);
+ double kd = (double) k;
+
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ double invc = __v_pow_log_data.invc[i];
+ double logc = __v_pow_log_data.logc[i];
+ double logctail = __v_pow_log_data.logctail[i];
+
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ double r = fma (z, invc, -1.0);
+
+ /* k*Ln2 + log(c) + r. */
+ double t1 = kd * __v_pow_log_data.ln2_hi + logc;
+ double t2 = t1 + r;
+ double lo1 = kd * __v_pow_log_data.ln2_lo + logctail;
+ double lo2 = t1 - t2 + r;
+
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double ar = As[0] * r;
+ double ar2 = r * ar;
+ double ar3 = r * ar2;
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ double hi = t2 + ar2;
+ double lo3 = fma (ar, r, -ar2);
+ double lo4 = t2 - hi + ar2;
+ /* p = log1p(r) - r - A[0]*r*r. */
+ double p = (ar3
+ * (As[1] + r * As[2]
+ + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6]))));
+ double lo = lo1 + lo2 + lo3 + lo4 + p;
+ double y = hi + lo;
+ *tail = hi - y + lo;
+ return y;
+}
+
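log_inline follows the decomposition in its comment: x = 2^k * z with a pivot c near z, so log(x) = k*ln2 + log(c) + log1p(z/c - 1). A scalar sketch of that identity, with frexp standing in for the bit-level normalisation and an arbitrary pivot instead of the table lookup:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 123.456;
  int k;
  double z = frexp (x, &k);    /* x = 2^k * z, z in [0.5, 1).  */
  double c = 0.75;             /* arbitrary pivot; the real code picks one
                                  close to z so log1p sees a tiny argument.  */
  double r = z / c - 1.0;
  printf ("%.17g %.17g\n", k * log (2.0) + log (c) + log1p (r), log (x));
  return 0;
}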
+/* Handle cases that may overflow or underflow when computing the result that
+ is scale*(1+TMP) without intermediate rounding. The bit representation of
+ scale is in SBITS, but its computed exponent may have overflowed into the
+ sign bit, so it must be adjusted before being used as a double.
+ (int32_t)KI is the k used in the argument reduction and exponent adjustment
+ of scale; positive k here means the result may overflow and negative k
+ means the result may underflow. */
+static inline double
+special_case (double tmp, uint64_t sbits, uint64_t ki)
+{
+ double scale, y;
+
+ if ((ki & 0x80000000) == 0)
+ {
+ /* k > 0, the exponent of scale might have overflowed by <= 460. */
+ sbits -= 1009ull << 52;
+ scale = asdouble (sbits);
+ y = 0x1p1009 * (scale + scale * tmp);
+ return y;
+ }
+ /* k < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ /* Note: sbits is signed scale. */
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+#if WANT_SIMD_EXCEPT
+ if (fabs (y) < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double hi, lo, one = 1.0;
+ if (y < 0.0)
+ one = -1.0;
+ lo = scale - y + scale * tmp;
+ hi = one + y;
+ lo = one - hi + y + lo;
+ y = (hi + lo) - one;
+ /* Fix the sign of 0. */
+ if (y == 0.0)
+ y = asdouble (sbits & 0x8000000000000000);
+ /* The underflow exception needs to be signaled explicitly. */
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ }
+#endif
+ y = 0x1p-1022 * y;
+ return y;
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
+static inline double
+exp_inline (double x, double xtail, uint32_t sign_bias)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ if (abstop - SmallExp >= 0x80000000)
+ {
+ /* Avoid spurious underflow for tiny x. */
+ /* Note: 0 is common input. */
+ return sign_bias ? -1.0 : 1.0;
+ }
+ if (abstop >= top12 (1024.0))
+ {
+ /* Note: inf and nan are already handled. */
+ /* Skip errno handling. */
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (sign_bias)
+ : __math_oflow (sign_bias);
+#else
+ double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;
+ return sign_bias ? -res_uoflow : res_uoflow;
+#endif
+ }
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return scale + scale * tmp;
+}
+
+/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ A version of exp_inline that is not inlined and for which sign_bias is
+ equal to 0. */
+static double NOINLINE
+exp_nosignbias (double x, double xtail)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ /* Avoid spurious underflow for tiny x. */
+ if (abstop - SmallExp >= 0x80000000)
+ return 1.0;
+ /* Note: inf and nan are already handled. */
+ if (abstop >= top12 (1024.0))
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);
+#else
+ return asuint64 (x) >> 63 ? 0.0 : INFINITY;
+#endif
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return scale + scale * tmp;
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
+static inline int
+checkint (uint64_t iy)
+{
+ int e = iy >> 52 & 0x7ff;
+ if (e < 0x3ff)
+ return 0;
+ if (e > 0x3ff + 52)
+ return 2;
+ if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+ return 0;
+ if (iy & (1ULL << (0x3ff + 52 - e)))
+ return 1;
+ return 2;
+}
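To make the exponent arithmetic above concrete: the biased exponent e says how many mantissa bits lie below the unit place, so any set bit in that range means a fractional value, and the bit exactly at the unit place gives the parity. A standalone restatement; the helper name classify_int and the memcpy bit-cast are illustrative, not from the source:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical standalone version of the classification above:
   0 = not an integer, 1 = odd integer, 2 = even integer.  */
static int
classify_int (double y)
{
  uint64_t iy;
  memcpy (&iy, &y, sizeof iy);
  int e = (int) (iy >> 52 & 0x7ff);
  if (e < 0x3ff)
    return 0; /* |y| < 1; a non-zero finite y cannot be an integer.  */
  if (e > 0x3ff + 52)
    return 2; /* No fractional bits left; such values are even.  */
  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
    return 0; /* Bits below the unit place: fractional value.  */
  return (iy & (1ULL << (0x3ff + 52 - e))) ? 1 : 2;
}

int
main (void)
{
  printf ("%d %d %d\n", classify_int (3.0), classify_int (4.0),
          classify_int (2.5)); /* Expect: 1 2 0.  */
  return 0;
}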
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline int
+zeroinfnan (uint64_t i)
+{
+ return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
+}
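The single comparison works because doubling the bit pattern discards the sign bit, and subtracting 1 makes the pattern of +-0 wrap around to the largest unsigned value, so zeros, infinities and NaNs all land at or above 2*asuint64(inf) - 1. A small standalone check; the helper name and the memcpy bit-cast are illustrative:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
is_zero_inf_nan (double x) /* Same test as above, restated standalone.  */
{
  uint64_t i, inf_bits;
  double inf = INFINITY;
  memcpy (&i, &x, sizeof i);
  memcpy (&inf_bits, &inf, sizeof inf_bits);
  return 2 * i - 1 >= 2 * inf_bits - 1;
}

int
main (void)
{
  printf ("%d %d %d %d %d\n", is_zero_inf_nan (0.0), is_zero_inf_nan (-0.0),
          is_zero_inf_nan (1.0), is_zero_inf_nan (INFINITY),
          is_zero_inf_nan (NAN)); /* Expect: 1 1 0 1 1.  */
  return 0;
}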
+
+static double NOINLINE
+pow_scalar_special_case (double x, double y)
+{
+ uint32_t sign_bias = 0;
+ uint64_t ix, iy;
+ uint32_t topx, topy;
+
+ ix = asuint64 (x);
+ iy = asuint64 (y);
+ topx = top12 (x);
+ topy = top12 (y);
+ if (unlikely (topx - SmallPowX >= ThresPowX
+ || (topy & 0x7ff) - SmallPowY >= ThresPowY))
+ {
+ /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0
+ and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */
+ /* Special cases: (x < 0x1p-126 or inf or nan) or
+ (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */
+ if (unlikely (zeroinfnan (iy)))
+ {
+ if (2 * iy == 0)
+ return issignaling_inline (x) ? x + y : 1.0;
+ if (ix == asuint64 (1.0))
+ return issignaling_inline (y) ? x + y : 1.0;
+ if (2 * ix > 2 * asuint64 (INFINITY)
+ || 2 * iy > 2 * asuint64 (INFINITY))
+ return x + y;
+ if (2 * ix == 2 * asuint64 (1.0))
+ return 1.0;
+ if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
+ return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
+ return y * y;
+ }
+ if (unlikely (zeroinfnan (ix)))
+ {
+ double x2 = x * x;
+ if (ix >> 63 && checkint (iy) == 1)
+ {
+ x2 = -x2;
+ sign_bias = 1;
+ }
+#if WANT_SIMD_EXCEPT
+ if (2 * ix == 0 && iy >> 63)
+ return __math_divzero (sign_bias);
+#endif
+ return iy >> 63 ? 1 / x2 : x2;
+ }
+ /* Here x and y are non-zero finite. */
+ if (ix >> 63)
+ {
+ /* Finite x < 0. */
+ int yint = checkint (iy);
+ if (yint == 0)
+#if WANT_SIMD_EXCEPT
+ return __math_invalid (x);
+#else
+ return __builtin_nan ("");
+#endif
+ if (yint == 1)
+ sign_bias = SignBias;
+ ix &= 0x7fffffffffffffff;
+ topx &= 0x7ff;
+ }
+ if ((topy & 0x7ff) - SmallPowY >= ThresPowY)
+ {
+ /* Note: sign_bias == 0 here because y is not odd. */
+ if (ix == asuint64 (1.0))
+ return 1.0;
+ /* |y| < 2^-65, x^y ~= 1 + y*log(x). */
+ if ((topy & 0x7ff) < SmallPowY)
+ return 1.0;
+#if WANT_SIMD_EXCEPT
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)
+ : __math_uflow (0);
+#else
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;
+#endif
+ }
+ if (topx == 0)
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ ix = asuint64 (x * 0x1p52);
+ ix &= 0x7fffffffffffffff;
+ ix -= 52ULL << 52;
+ }
+ }
+
+ double lo;
+ double hi = log_inline (ix, &lo);
+ double ehi = y * hi;
+ double elo = y * lo + fma (y, hi, -ehi);
+ return exp_inline (ehi, elo, sign_bias);
+}
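The last three lines split y*log(x) into a rounded product ehi plus a correction elo, where fma (y, hi, -ehi) recovers the exact rounding error of y*hi and y*lo adds the low part of the logarithm, so exp_inline sees the product to extra precision. A minimal sketch of that splitting with made-up hi/lo values; only fma from <math.h> is assumed:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* ehi is the rounded product y*hi; elo collects the exact rounding error
     of that product (recovered with fma) plus the y*lo term, i.e. everything
     a single multiply would discard.  The hi/lo values are made up.  */
  double y = 0x1.8p-13;
  double hi = -93.7, lo = 1.3e-15;
  double ehi = y * hi;
  double elo = y * lo + fma (y, hi, -ehi);
  printf ("ehi = %a\nelo = %a\n", ehi, elo);
  return 0;
}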
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
new file mode 100644
index 000000000000..dc01ed5bac93
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
@@ -0,0 +1,95 @@
+/*
+ * Double-precision vector hypot(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint64x2_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint64x2_t tiny_bound;
+ uint32x4_t thres;
+} data = {
+ .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+ uint32x2_t special)
+{
+ return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+ got 0x1.6a1b19400964ep-204
+ want 0x1.6a1b19400964dp-204. */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+ uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+ uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f64 (ax, specialx);
+ ay = v_zerofy_f64 (ay, specialy);
+ uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#endif
+
+TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_D2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
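For reference, the exported symbol quoted in the error report above (_ZGVnN2vv_hypot) can be called directly with NEON vectors. A minimal usage sketch, assuming the library is built with the vector PCS and linked in; the hand-written prototype below is illustrative, not taken from a header:

#include <arm_neon.h>
#include <stdio.h>

/* Prototype written by hand for illustration; the symbol name matches the
   one quoted in the worst-case error report above.  */
__attribute__ ((aarch64_vector_pcs)) float64x2_t
_ZGVnN2vv_hypot (float64x2_t x, float64x2_t y);

int
main (void)
{
  float64x2_t x = { 3.0, 5.0 };
  float64x2_t y = { 4.0, 12.0 };
  float64x2_t r = _ZGVnN2vv_hypot (x, y);
  printf ("%g %g\n", vgetq_lane_f64 (r, 0), vgetq_lane_f64 (r, 1)); /* 5 13 */
  return 0;
}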
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
new file mode 100644
index 000000000000..69634875be5a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
@@ -0,0 +1,96 @@
+/*
+ * Single-precision vector hypot(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint32x4_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint32x4_t tiny_bound;
+ uint16x8_t thres;
+} data = {
+ .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+ uint16x4_t special)
+{
+ return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+ want 0x1.6a41dp-13. */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+ uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+ uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f32 (ax, specialx);
+ ay = v_zerofy_f32 (ay, specialy);
+ uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+ uint16x4_t special
+ = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#endif
+
+HALF_WIDTH_ALIAS_F2 (hypot)
+
+TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_F2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
new file mode 100644
index 000000000000..94e3f4482079
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
@@ -0,0 +1,118 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, ln2, c4;
+} data = {
+ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .c0 = V2 (-0x1.ffffffffffff7p-2),
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = V2 (-0x1.0000000399c27p-2),
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ .off = V2 (0x3fe6900900000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log routine.
+ The maximum observed error is 2.17 ULP:
+ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+ want 0x1.ffffff1cca045p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
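The reduction comment above ("x = 2^k z; where z is in range [Off,2*Off) and exact") relies on doing the split in the integer domain: after subtracting Off, an arithmetic shift by 52 yields k, and masking off the shifted exponent bits leaves z. A scalar sketch of the same split; Off matches the .off constant above, and the variable names are illustrative:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const uint64_t Off = 0x3fe6900900000000;
  double x = 123.456;
  uint64_t u, iz;
  memcpy (&u, &x, sizeof u);
  uint64_t u_off = u - Off;
  int64_t k = (int64_t) u_off >> 52; /* Arithmetic shift gives the scale k.  */
  iz = u - (u_off & 0xfff0000000000000ULL); /* Remove k from the exponent.  */
  double z;
  memcpy (&z, &iz, sizeof z);
  printf ("k = %lld  z = %.17g  ldexp(z,k) = %.17g\n", (long long) k, z,
          ldexp (z, (int) k)); /* ldexp(z,k) reproduces x exactly.  */
  return 0;
}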
+
+TEST_SIG (V, D, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log), 1.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
+TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
new file mode 100644
index 000000000000..c2b8f1c54f0e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
@@ -0,0 +1,132 @@
+/*
+ * Double-precision vector log10(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ double invln10, log10_2;
+ double c1, c3;
+ float64x2_t c0, c2, c4;
+} data = {
+ /* Computed from log coefficients divided by log(10) then rounded to double
+ precision. */
+ .c0 = V2 (-0x1.bcb7b1526e506p-3),
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = V2 (-0x1.bcb7b158af938p-4),
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = V2 (-0x1.287461742fee4p-4),
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
+};
+
+#define N (1 << V_LOG10_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log10c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log10c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Fast implementation of double-precision vector log10
+ is a slight modification of double-precision vector log.
+ Max ULP error: < 2.5 ulp (nearest rounding.)
+ Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+ _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+ want 0x1.fff6be3cae4b9p-6. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r / log(10) + log10(c) + k*log10(2).
+ Constants in v_log10_data.c are computed (in extended precision) as
+ e.log10c := e.logc * invln10. */
+ float64x2_t cte = vld1q_f64 (&d->invln10);
+ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
+
+ /* y = log10(1+r) + n * log10(2). */
+ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_f64 (y, d->c4, r2);
+ y = vfmaq_f64 (p, y, r2);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log10), 1.97)
+TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
new file mode 100644
index 000000000000..907c1051e086
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+ .c0 = V4 (-0x1.bcb79cp-3f),
+ .c1 = 0x1.2879c8p-3f,
+ .c2 = V4 (-0x1.bcd472p-4f),
+ .c3 = 0x1.6408f8p-4f,
+ .c4 = V4 (-0x1.246f8p-4f),
+ .c5 = 0x1.f0e514p-5f,
+ .c6 = V4 (-0x1.0fc92cp-4f),
+ .c7 = 0x1.f5f76ap-5f,
+ .ln2 = V4 (0x1.62e43p-1f),
+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+}
+
+/* Fast implementation of AdvSIMD log10f.
+ Uses a similar approach to AdvSIMD logf, with the same offset (i.e., 2/3) and
+ an order 9 polynomial.
+ Maximum error: 3.305 ulps (nearest rounding).
+ _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log10(1+r) + n * log10(2). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
+ /* y = Log10(2) * n + poly * InvLn(10). */
+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+ y = vmulq_f32 (y, d->inv_ln10);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (y, u_off, poly, r2, special, d);
+ return vfmaq_f32 (y, poly, r2);
+}
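The c1357 vector above packs four scalar coefficients into one register so each odd term is applied with a lane-indexed FMA, avoiding a separate constant load per coefficient. A minimal sketch of the same idiom on a toy cubic; the coefficients and inputs here are made up, not the ones above:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* Evaluate p(r) = k0 + k1*r + k2*r^2 + k3*r^3 for four lanes at once,
     keeping the odd/even split and lane-indexed FMAs used above.  */
  const float k[4] = { 1.0f, 0.5f, 0.25f, 0.125f }; /* Toy coefficients.  */
  float32x4_t coeffs = vld1q_f32 (k);
  float32x4_t r = { 0.1f, 0.2f, 0.3f, 0.4f };
  float32x4_t r2 = vmulq_f32 (r, r);

  /* c01 = k0 + k1*r, c23 = k2 + k3*r, p = c01 + r2*c23.  */
  float32x4_t c01 = vfmaq_laneq_f32 (vdupq_n_f32 (k[0]), r, coeffs, 1);
  float32x4_t c23 = vfmaq_laneq_f32 (vdupq_n_f32 (k[2]), r, coeffs, 3);
  float32x4_t p = vfmaq_f32 (c01, r2, c23);

  float res[4];
  vst1q_f32 (res, p);
  printf ("%f %f %f %f\n", res[0], res[1], res[2], res[3]);
  return 0;
}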
+
+HALF_WIDTH_ALIAS_F1 (log10)
+
+TEST_SIG (V, F, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log10), 2.81)
+TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
new file mode 100644
index 000000000000..42a0c5793920
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
@@ -0,0 +1,61 @@
+/*
+ * Double-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data d;
+ uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000) };
+
+#define BottomMask v_u64 (0xffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+ a modification of the algorithm used in scalar log1p, with no shortcut for
+ k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+ _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+ want 0x1.fd61d0727429fp+2 . */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ uint64x2_t special_cases
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1p_inline (x, &d->d);
+}
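The special-case handler above defuses the flagged lanes before the shared approximation runs, then lets the scalar routine repair exactly those lanes. A sketch of that shape with plain intrinsics, under the assumption that v_zerofy_f64 clears flagged lanes and v_call_f64 substitutes scalar results for them (both are library helpers whose definitions are not shown here):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  float64x2_t x = { 0.5, -2.0 }; /* Lane 1 is out of log1p's domain.  */
  uint64x2_t special = vcleq_f64 (x, vdupq_n_f64 (-1.0));

  /* Defuse flagged lanes so the fast path cannot raise spurious
     exceptions (here they become +0).  */
  float64x2_t x_nospecial = vreinterpretq_f64_u64 (
      vbicq_u64 (vreinterpretq_u64_f64 (x), special));

  /* Stand-in for the vector approximation: scalar log1p per lane.  */
  double fast_lanes[2] = { log1p (vgetq_lane_f64 (x_nospecial, 0)),
                           log1p (vgetq_lane_f64 (x_nospecial, 1)) };

  /* Scalar fixup, merged back into the flagged lanes only.  */
  double fix_lanes[2] = { log1p (vgetq_lane_f64 (x, 0)),
                          log1p (vgetq_lane_f64 (x, 1)) };
  float64x2_t merged = vbslq_f64 (special, vld1q_f64 (fix_lanes),
                                  vld1q_f64 (fast_lanes));

  printf ("%g %g\n", vgetq_lane_f64 (merged, 0), vgetq_lane_f64 (merged, 1));
  return 0;
}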
+
+TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_D1 (log1p), 1.95)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
new file mode 100644
index 000000000000..94b90249128f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
+
+const static struct data
+{
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
+} data = {
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
+ .minus_one = V4 (0xbf800000),
+};
+
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ uint32x4_t special_cases
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
+ vcgeq_u32 (ix, d->minus_one));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1pf_inline (x, &d->d);
+}
+
+#else
+
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
+
+ return log1pf_inline (x, ptr_barrier (&data));
+}
+
+#endif
+
+HALF_WIDTH_ALIAS_F1 (log1p)
+
+TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_F1 (log1p), 1.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
new file mode 100644
index 000000000000..7d2e44dad2c9
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
@@ -0,0 +1,123 @@
+/*
+ * Double-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, invln2, c4;
+} data = {
+ /* Each coefficient was generated to approximate log(1+r) for |r| < 0x1.fp-9
+ and N = 128, then scaled by log2(e) in extended precision and rounded back
+ to double precision. */
+ .c0 = V2 (-0x1.71547652b8300p-1),
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = V2 (-0x1.71547651c8f35p-2),
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG2_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log2c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log2c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log2 routine. Implements the same algorithm as
+ vector log10, with coefficients and table entries scaled in extended
+ precision. The maximum observed error is 2.58 ULP:
+ _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
+ float64x2_t hi
+ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
+
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log2), 2.09)
+TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
new file mode 100644
index 000000000000..3053c64bc552
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
@@ -0,0 +1,102 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, c8;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Coefficients generated using Remez algorithm approximate
+ log2(1+r)/r for r in [ -1/3, 1/3 ].
+ rel error: 0x1.c4c4b0cp-26. */
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ .c1 = -0x1.715458p-1f,
+ .c2 = V4 (0x1.ec701cp-2f),
+ .c3 = -0x1.7171a4p-2f,
+ .c4 = V4 (0x1.27a0b8p-2f),
+ .c5 = -0x1.e5143ep-3f,
+ .c6 = V4 (0x1.9d8ecap-3f),
+ .c7 = -0x1.c675bp-3f,
+ .c8 = V4 (0x1.9e495p-3f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+}
+
+/* Fast implementation for single-precision AdvSIMD log2.
+ Relies on the same argument reduction as AdvSIMD logf.
+ Maximum error: 2.48 ULPs
+ _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+ want 0x1.a9be8p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log2(1+r) + n. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (n, u_off, p, r, special, d);
+ return vfmaq_f32 (n, p, r);
+}
+
+HALF_WIDTH_ALIAS_F1 (log2)
+
+TEST_SIG (V, F, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log2), 1.99)
+TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
new file mode 100644
index 000000000000..84705fad05ee
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c2, c4, c6, ln2;
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
+ uint16x8_t special_bound;
+ float c1, c3, c5, c0;
+} data = {
+ /* 3.34 ulp error. */
+ .c0 = -0x1.3e737cp-3f,
+ .c1 = 0x1.5a9aa2p-3f,
+ .c2 = V4 (-0x1.4f9934p-3f),
+ .c3 = 0x1.961348p-3f,
+ .c4 = V4 (-0x1.00187cp-2f),
+ .c5 = 0x1.555d7cp-2f,
+ .c6 = V4 (-0x1.ffffc8p-2f),
+ .ln2 = V4 (0x1.62e43p-1f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff)
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
+ q = vfmaq_f32 (q, p, r2);
+ y = vfmaq_f32 (y, q, r2);
+ p = vfmaq_f32 (r, d->ln2, n);
+
+ if (unlikely (v_any_u16h (cmp)))
+ return special_case (p, u_off, y, r2, cmp, d);
+ return vfmaq_f32 (p, y, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log)
+
+TEST_SIG (V, F, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log), 2.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
+TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
+TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
new file mode 100644
index 000000000000..da2fcbff8514
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
@@ -0,0 +1,33 @@
+/*
+ * Double-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modf algorithm. Produces exact values in all rounding modes. */
+float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
+{
+ /* Get integer component of x. */
+ float64x2_t rounded = vrndq_f64 (x);
+ vst1q_f64 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint64x2_t is_integer = vceqq_f64 (x, rounded);
+ return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer));
+}
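The vbic at the end matters because an exact x - x is -0 when rounding toward negative infinity, so for integral inputs the subtraction alone could leave a stray sign bit; clearing the flagged lanes guarantees the +0 promised in the comment. A small scalar illustration; it needs <fenv.h> and, strictly speaking, FENV_ACCESS or -frounding-math:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  volatile double x = 7.0; /* volatile keeps the subtraction at run time.  */
  fesetround (FE_DOWNWARD);
  double frac = x - rint (x); /* Exact zero, rounded toward -inf: -0.  */
  fesetround (FE_TONEAREST);
  printf ("signbit(x - rint(x)) under FE_DOWNWARD: %d\n", signbit (frac) != 0);
  return 0;
}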
+
+TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
new file mode 100644
index 000000000000..0a646b24cb1a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
@@ -0,0 +1,34 @@
+/*
+ * Single-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modff algorithm. Produces exact values in all rounding modes. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
+ float *out_int)
+{
+ /* Get integer component of x. */
+ float32x4_t rounded = vrndq_f32 (x);
+ vst1q_f32 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint32x4_t is_integer = vceqq_f32 (x, rounded);
+ return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer));
+}
+
+TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
new file mode 100644
index 000000000000..db9d6e9ba14b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
@@ -0,0 +1,284 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Defines parameters of the approximation and scalar fallback. */
+#include "finite_pow.h"
+
+#define VecSmallPowX v_u64 (SmallPowX)
+#define VecThresPowX v_u64 (ThresPowX)
+#define VecSmallPowY v_u64 (SmallPowY)
+#define VecThresPowY v_u64 (ThresPowY)
+
+static const struct data
+{
+ uint64x2_t inf;
+ float64x2_t small_powx;
+ uint64x2_t offset, mask;
+ uint64x2_t mask_sub_0, mask_sub_1;
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
+ double log_c1, log_c3;
+ double ln2_lo, ln2_hi;
+ uint64x2_t small_exp, thres_exp;
+ double ln2_lo_n, ln2_hi_n;
+ double inv_ln2_n, exp_c2;
+ float64x2_t exp_c0, exp_c1;
+} data = {
+ /* Power threshold. */
+ .inf = V2 (0x7ff0000000000000),
+ .small_powx = V2 (0x1p-126),
+ .offset = V2 (Off),
+ .mask = V2 (0xfffULL << 52),
+ .mask_sub_0 = V2 (1ULL << 52),
+ .mask_sub_1 = V2 (52ULL << 52),
+ /* Coefficients copied from v_pow_log_data.c
+ relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
+ Coefficients are scaled to match the scaling during evaluation. */
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
+ .log_c1 = -0x1.0000000000006p-2 * -2,
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
+ (0.550 without fma) if |x| < ln2/512. */
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .small_exp = V2 (0x3c90000000000000),
+ .thres_exp = V2 (0x03f0000000000000),
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
+};
+
+/* This version implements an algorithm close to scalar pow but
+ - does not implement the trick in the exp's specialcase subroutine to avoid
+ double-rounding,
+ - does not use a tail in the exponential core computation,
+ - and pow's exp polynomial order and table bits might differ.
+
+ Maximum measured error is 1.04 ULPs:
+ _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13)
+ got 0x1.f71162f473251p-1
+ want 0x1.f71162f473252p-1. */
+
+static inline float64x2_t
+v_masked_lookup_f64 (const double *table, uint64x2_t i)
+{
+ return (float64x2_t){
+ table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)],
+ table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)]
+ };
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline float64x2_t
+v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
+{
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp);
+ float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp);
+ float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp);
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
+ /* k*Ln2 + log(c) + r. */
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
+ float64x2_t t2 = vaddq_f64 (t1, r);
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
+ float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
+ float64x2_t ar2 = vmulq_f64 (r, ar);
+ float64x2_t ar3 = vmulq_f64 (r, ar2);
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ float64x2_t hi = vaddq_f64 (t2, ar2);
+ float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
+ float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
+ /* p = log1p(r) - r - A[0]*r*r. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
+ float64x2_t p = vfmaq_f64 (a34, ar2, a56);
+ p = vfmaq_f64 (a12, ar2, p);
+ p = vmulq_f64 (ar3, p);
+ float64x2_t lo
+ = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p);
+ float64x2_t y = vaddq_f64 (hi, lo);
+ *tail = vaddq_f64 (vsubq_f64 (hi, y), lo);
+ return y;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+exp_special_case (float64x2_t x, float64x2_t xtail)
+{
+ return (float64x2_t){ exp_nosignbias (x[0], xtail[0]),
+ exp_nosignbias (x[1], xtail[1]) };
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
+static inline float64x2_t
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
+{
+ /* Fall back to scalar exp_inline for all lanes if any lane
+ contains a value x s.t. |x| < 2^-54 or |x| >= 512. */
+ uint64x2_t uoflowx = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+ d->thres_exp);
+ if (unlikely (v_any_u64 (uoflowx)))
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+ float64x2_t kd = vrndnq_f64 (z);
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = vsubq_f64 (r, neg_xtail);
+ /* 2^(k/N) ~= scale. */
+ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
+ uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64x2_t sbits = v_lookup_u64 (SBits, idx);
+ sbits = vaddq_u64 (sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
+ tmp = vfmaq_f64 (r, r2, tmp);
+ float64x2_t scale = vreinterpretq_f64_u64 (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return vfmaq_f64 (scale, scale, tmp);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+scalar_fallback (float64x2_t x, float64x2_t y)
+{
+ return (float64x2_t){ pow_scalar_special_case (x[0], y[0]),
+ pow_scalar_special_case (x[1], y[1]) };
+}
+
+float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Case of x <= 0 is too complicated to be vectorised efficiently here;
+ fall back to scalar pow for all lanes if any x < 0 is detected. */
+ if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
+ return scalar_fallback (x, y);
+
+ uint64x2_t vix = vreinterpretq_u64_f64 (x);
+ uint64x2_t viy = vreinterpretq_u64_f64 (y);
+ uint64x2_t iay = vandq_u64 (viy, d->inf);
+
+ /* Special cases of x or y. */
+#if WANT_SIMD_EXCEPT
+ /* Small or large. */
+ uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
+ uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
+ uint64x2_t specialx
+ = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
+ uint64x2_t specialy
+ = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
+#else
+ /* The case y==0 does not trigger a special case, since in this case it is
+ necessary to fix the result only if x is a signalling nan, which already
+ triggers a special case. We test y==0 directly in the scalar fallback. */
+ uint64x2_t iax = vandq_u64 (vix, d->inf);
+ uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
+ uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
+#endif
+ uint64x2_t special = vorrq_u64 (specialx, specialy);
+ /* Fallback to scalar on all lanes if any lane is inf or nan. */
+ if (unlikely (v_any_u64 (special)))
+ return scalar_fallback (x, y);
+
+ /* Small cases of x: |x| < 0x1p-126. */
+ uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
+ if (unlikely (v_any_u64 (smallx)))
+ {
+ /* Update ix if top 12 bits of x are 0. */
+ uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
+ if (unlikely (v_any_u64 (sub_x)))
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ uint64x2_t vix_norm = vreinterpretq_u64_f64 (
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
+ vix = vbslq_u64 (sub_x, vix_norm, vix);
+ }
+ }
+
+ /* Vector Log(ix, &lo). */
+ float64x2_t vlo;
+ float64x2_t vhi = v_log_inline (vix, &vlo, d);
+
+ /* Vector Exp(y_loghi, y_loglo). */
+ float64x2_t vehi = vmulq_f64 (y, vhi);
+ float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+ return v_exp_inline (vehi, neg_velo, d);
+}
+
+TEST_SIG (V, D, 2, pow)
+TEST_ULP (V_NAME_D2 (pow), 0.55)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
+/* Wide intervals spanning the whole domain but shared between x and y. */
+#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define EXPAND(str) str##000000000
+#define SHL52(str) EXPAND (str)
+V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, inf, 1000)
+/* x~1 or y~1. */
+V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+/* around argmaxs of ULP error. */
+V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer. */
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* 1.0^y. */
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
new file mode 100644
index 000000000000..47f74cf38ab0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
@@ -0,0 +1,209 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min. */
+#define MantissaMask v_u32 (0x007fffff)
+
+#define A d->log2_poly
+#define C d->exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+static const struct data
+{
+ struct
+ {
+ double invc, logc;
+ } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+ float64x2_t log2_poly[4];
+ uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
+ float64x2_t exp2f_poly[3];
+} data = {
+ .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+ {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+ {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+ {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+ {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+ {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+ {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+ {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+ {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+ {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+ {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+ {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+ {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+ {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+ {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+ {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+ {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+ {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+ {0x1p+0, 0x0p+0 * Scale},
+ {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+ {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+ {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+ {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+ {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+ {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+ {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+ {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+ {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+ {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+ {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+ {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+ {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+ .log2_poly = { /* rel err: 1.5 * 2^-30. */
+ V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
+ V2 (0x1.ec81d03c01aebp-2 * Scale),
+ V2 (-0x1.71547bb43f101p-1 * Scale),
+ V2 (0x1.7154764a815cbp0 * Scale)},
+ .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+ .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
+ V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
+ V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
+ V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+ return v_call2_f32 (powf, x, y, ret, cmp);
+}
+
+static inline float64x2_t
+ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
+ float64x2_t invc, float64x2_t logc, float64x2_t y)
+{
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc);
+ float64x2_t y0 = vaddq_f64 (logc, k);
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+ float64x2_t logx = vfmaq_f64 (A[1], r, A[0]);
+ logx = vfmaq_f64 (A[2], logx, r);
+ logx = vfmaq_f64 (A[3], logx, r);
+ logx = vfmaq_f64 (y0, logx, r);
+
+ return vmulq_f64 (logx, y);
+}
+
+static inline float64x2_t
+log2_lookup (const struct data *d, uint32_t i)
+{
+ return vld1q_f64 (
+ &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc);
+}
+
+static inline uint64x1_t
+exp2f_lookup (const struct data *d, uint64_t i)
+{
+ return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]);
+}
+
+static inline float32x2_t
+powf_core (const struct data *d, float64x2_t ylogx)
+{
+ /* N*x = k + r with r in [-1/2, 1/2]. */
+ float64x2_t kd = vrndnq_f64 (ylogx);
+ int64x2_t ki = vcvtaq_s64_f64 (ylogx);
+ float64x2_t r = vsubq_f64 (ylogx, kd);
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+ uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
+ exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
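+ /* exp2f_tab[i] stores asuint64 (2^(i/N)) with i << (52 - V_EXP2F_TABLE_BITS)
+ pre-subtracted; adding ki << (52 - V_EXP2F_TABLE_BITS) back below restores
+ the fractional part and folds floor(ki/N) into the exponent, so that
+ s ~= 2^(ki/N). */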
+ t = vaddq_u64 (
+ t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
+ float64x2_t s = vreinterpretq_f64_u64 (t);
+ float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
+ p = vfmaq_f64 (C[2], r, p);
+ p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
+ return vcvt_f32_f64 (p);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
+ uint32x4_t tmp = vsubq_u32 (u, Off);
+ uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
+ float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top));
+ int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
+ 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
+
+ /* Use double precision for each lane: split input vectors into lo and hi
+ halves and promote. */
+ float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)),
+ tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)),
+ tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)),
+ tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3));
+
+ float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)),
+ iz_hi = vcvt_high_f64_f32 (iz);
+
+ float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))),
+ k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
+
+ float64x2_t invc_lo = vzip1q_f64 (tab0, tab1),
+ invc_hi = vzip1q_f64 (tab2, tab3),
+ logc_lo = vzip2q_f64 (tab0, tab1),
+ logc_hi = vzip2q_f64 (tab2, tab3);
+
+ float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)),
+ y_hi = vcvt_high_f64_f32 (y);
+
+ float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo);
+ float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi);
+
+ uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
+ vreinterpretq_u32_f64 (ylogx_hi));
+
+ cmp = vorrq_u32 (
+ cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)),
+ vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS))
+ >> 47)));
+
+ float32x2_t p_lo = powf_core (d, ylogx_lo);
+ float32x2_t p_hi = powf_core (d, ylogx_hi);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp);
+ return vcombine_f32 (p_lo, p_hi);
+}
+
+HALF_WIDTH_ALIAS_F2 (pow)
+
+TEST_SIG (V, F, 2, pow)
+TEST_ULP (V_NAME_F2 (pow), 2.1)
+TEST_DISABLE_FENV (V_NAME_F2 (pow))
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
new file mode 100644
index 000000000000..0461bbb99405
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
@@ -0,0 +1,105 @@
+/*
+ * Double-precision vector sin function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+
+ .range_val = V2 (0x1p23),
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+};
+
+#if WANT_SIMD_EXCEPT
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u64 (0x1160000000000000)
+#endif
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (sin, x, y, cmp);
+}
+
+/* Vector (AdvSIMD) sin approximation.
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+ is 2.87 ULP:
+ _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+ want 0x1.fffffffa7dc05p-1
+ Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+ is 3.22 ULP:
+ _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+ want 0x1.ffdcd125c84f8p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
+ triggered correctly, set any special lanes to 0 (which is neutral w.r.t.
+ fenv). These lanes will be fixed by the special-case handler later. */
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
+#else
+ r = x;
+ cmp = vcageq_f64 (x, d->range_val);
+#endif
+
+ /* n = rint(x/pi). */
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+TEST_SIG (V, D, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (sin), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
new file mode 100644
index 000000000000..83bfa45efa98
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
@@ -0,0 +1,67 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincos declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincos from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "v_sincos_common.h"
+
+/* sincos not available for all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincos (double x, double *out_sin, double *out_cos)
+{
+ *out_sin = sin (x);
+ *out_cos = cos (x);
+}
+#endif
+
+static void VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, double *out_sin,
+ double *out_cos)
+{
+ if (special[0])
+ sincos (x[0], out_sin, out_cos);
+ if (special[1])
+ sincos (x[1], out_sin + 1, out_cos + 1);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u64 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
+TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
+TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
+#define V_SINCOS_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
+V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
+V_SINCOS_INTERVAL (0x1p23, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
new file mode 100644
index 000000000000..cd482f38d5f6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincosf declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincosf from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* sincos not available for all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincosf (float x, float *out_sin, float *out_cos)
+{
+ *out_sin = sinf (x);
+ *out_cos = cosf (x);
+}
+#endif
+
+static void VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float *out_sin,
+ float *out_cos)
+{
+ for (int i = 0; i < 4; i++)
+ if (special[i])
+ sincosf (x[i], out_sin + i, out_cos + i);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u32 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
+TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+#define V_SINCOSF_INTERVAL(lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
+V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
+V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
new file mode 100644
index 000000000000..fd425202ce67
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
@@ -0,0 +1,44 @@
+/*
+ * Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_sincospi_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using separate argument reduction and shared low-order
+ polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
+
+ float64x2x2_t sc = v_sincospi_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
+TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
+TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
+# define V_SINCOSPI_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
+V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
+V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
new file mode 100644
index 000000000000..760ea3d4f5e1
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
@@ -0,0 +1,43 @@
+/*
+ * Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincospif_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
+
+ float32x4x2_t sc = v_sincospif_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
+TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
+TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
+# define V_SINCOSPIF_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
+V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
+V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
new file mode 100644
index 000000000000..0764434039a0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .range_val = V4 (0x1p20f)
+};
+
+#if WANT_SIMD_EXCEPT
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u32 (0x27800000)
+#endif
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (sinf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 0 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+ special-case handler later. */
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
+#else
+ r = x;
+ cmp = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(x/pi). */
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
+
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (sin)
+
+TEST_SIG (V, F, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (sin), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
new file mode 100644
index 000000000000..f65ccd0c6270
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
@@ -0,0 +1,80 @@
+/*
+ * Double-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t halff;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ float64x2_t large_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+ .halff = V2 (0x3fe0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
+#endif
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
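+ /* With t = expm1(|x|): exp(|x|) = 1 + t and exp(-|x|) = 1/(1 + t), so
+ sinh(|x|) = (exp(|x|) - exp(-|x|)) / 2 = (t + t/(1 + t)) / 2; the final
+ multiply by halfsign (+/-0.5) restores both the 1/2 and the sign of x. */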
+ float64x2_t t = expm1_inline (ax, &d->d);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
+
+TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (sinh), 2.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
new file mode 100644
index 000000000000..12dbe26b425b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x1.61814ep+6),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
+{
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
+#endif
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, t, halfsign, special);
+
+ return vmulq_f32 (t, halfsign);
+}
+
+HALF_WIDTH_ALIAS_F1 (sinh)
+
+TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
new file mode 100644
index 000000000000..f86d167a2ac3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */
+/* asuint64(0x1p64) - TinyBound. */
+# define Thresh v_u64 (0x07f0000000000000)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (arm_math_sinpi, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector double-precision sinpi(x).
+ Maximum Error 3.05 ULP:
+ _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
+ want 0x1.fb295878301cap-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float64x2_t r = v_zerofy_f64 (x, cmp);
+#else
+ float64x2_t r = x;
+#endif
+
+ /* If rint(r) is odd, the sign of the result should be inverted. */
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+ /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f64 (r, vrndaq_f64 (r));
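+ /* sinpi(x) = sin(pi * (n + r)) = (-1)^n * sin(pi * r), with n = rint(x) and
+ the reduced r in [-1/2, 1/2], so the polynomial below approximates
+ sin(pi * r) and the parity of n is applied afterwards via 'odd'. */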
+
+ /* y = sin(pi * r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (sinpi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
new file mode 100644
index 000000000000..98ba9d84d2fb
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */
+# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (arm_math_sinpif, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector single-precision sinpi(x)
+ Maximum Error 3.03 ULP:
+ _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float32x4_t r = v_zerofy_f32 (x, cmp);
+#else
+ float32x4_t r = x;
+#endif
+
+ /* If rint(r) is odd, the sign of the result should be inverted. */
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+ /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+HALF_WIDTH_ALIAS_F1 (sinpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (sinpi), 2.54)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
new file mode 100644
index 000000000000..957f9aba3a1e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision vector tan(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t poly[9];
+ double half_pi[2];
+ float64x2_t two_over_pi, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
+ V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6),
+ V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
+ V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
+ V2 (0x1.4e4fd14147622p-12) },
+ .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
+ .two_over_pi = V2 (0x1.45f306dc9c883p-1),
+ .shift = V2 (0x1.8p52),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V2 (0x1p23),
+#endif
+};
+
+#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
+#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
+#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
+
+/* Special cases (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x)
+{
+ return v_call_f64 (tan, x, x, v_u64 (-1));
+}
+
+/* Vector approximation for double-precision tan.
+ Maximum measured error is 3.48 ULP:
+ _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ /* Our argument reduction cannot calculate q with sufficient accuracy for
+ very large inputs. Fall back to scalar routine for all lanes if any are
+ too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+ tiny input to avoid underflow. */
+#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ /* iax - tiny_bound > range_val - tiny_bound. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+#endif
+
+ /* q = nearest integer to 2 * x / pi. */
+ float64x2_t q
+ = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift);
+ int64x2_t qi = vcvtq_s64_f64 (q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ float64x2_t r = x;
+ float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 1);
+ /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+ formula. */
+ r = vmulq_n_f64 (r, 0.5);
+
+ /* Approximate tan(r) using order 8 polynomial.
+ tan(x) is odd, so polynomial has the form:
+ tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+ Then compute the approximation by:
+ tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
+ float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2),
+ r8 = vmulq_f64 (r4, r4);
+ /* Offset coefficients to evaluate from C1 onwards. */
+ float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1);
+ p = vfmaq_f64 (dat->poly[0], p, r2);
+ p = vfmaq_f64 (r, r2, vmulq_f64 (p, r));
+
+ /* Recombination uses double-angle formula:
+ tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+ and reciprocity around pi/2:
+ tan(x) = 1 / (tan(pi/2 - x))
+ to assemble result using change-of-sign and conditional selection of
+ numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+ */
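+ /* With p ~= tan(r) and 2r = x - q * pi/2, let n = p^2 - 1 and d = 2p:
+ q even: tan(x) = tan(2r) = 2p / (1 - p^2) = -d / n
+ q odd: tan(x) = tan(2r + pi/2) = -cotan(2r) = (p^2 - 1) / (2p) = n / d. */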
+ float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
+ float64x2_t d = vaddq_f64 (p, p);
+
+ uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
+
+#if !WANT_SIMD_EXCEPT
+ uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+#endif
+
+ return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
+ vbslq_f64 (no_recip, d, n));
+}
+
+TEST_SIG (V, D, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
new file mode 100644
index 000000000000..ed5448649f6c
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
@@ -0,0 +1,130 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+ float pi_consts[4];
+ float32x4_t shift;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
+ V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
+ /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
+ .pi_consts
+ = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
+ .shift = V4 (0x1.8p+23f),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V4 (0x1p15f),
+#endif
+};
+
+#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */
+#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */
+#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */
+
+/* Special cases (fall back to scalar calls). */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ return v_call_f32 (tanf, x, y, cmp);
+}
+
+/* Use a full Estrin scheme to evaluate polynomial. */
+static inline float32x4_t
+eval_poly (float32x4_t z, const struct data *d)
+{
+ float32x4_t z2 = vmulq_f32 (z, z);
+#if WANT_SIMD_EXCEPT
+ /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+ If fp exceptions are to be triggered correctly,
+ sidestep this by fixing such lanes to 0. */
+ uint32x4_t will_uflow
+ = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
+ if (unlikely (v_any_u32 (will_uflow)))
+ z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
+#endif
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+ return v_estrin_5_f32 (z, z2, z4, d->poly);
+}
+
+/* Fast implementation of AdvSIMD tanf.
+ Maximum error is 3.45 ULP:
+ __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+ want 0x1.ff9850p-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t special_arg = x;
+
+ /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
+ argument reduction. */
+#if WANT_SIMD_EXCEPT
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ /* If fp exceptions are to be triggered correctly, also special-case tiny
+ input, as this will lead to overflow later. Fix any special lanes to 1 to
+ prevent any exceptions being triggered. */
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
+ if (unlikely (v_any_u32 (special)))
+ x = vbslq_f32 (special, v_f32 (1.0f), x);
+#else
+ /* Otherwise, special-case large and special values. */
+ uint32x4_t special = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(x/(pi/2)). */
+ float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
+ float32x4_t n = vsubq_f32 (q, d->shift);
+ /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
+ uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
+
+ /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
+ float32x4_t r;
+ r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
+
+ /* If x lives in an interval where |tan(x)|
+ - is finite, then use a polynomial approximation of the form
+ tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+ - grows to infinity then use symmetries of tangent and the identity
+ tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+ the same polynomial approximation of tan as above. */
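+ /* Concretely, for odd n: x = n * pi/2 + r, so tan(x) = -cotan(r)
+ = -1/tan(r) = 1/tan(-r); hence negate r and select the reciprocal below. */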
+
+ /* Invert sign of r if odd quadrant. */
+ float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1)));
+
+ /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */
+ float32x4_t z2 = vmulq_f32 (r, r);
+ float32x4_t p = eval_poly (z2, d);
+ float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* Compute reciprocal and apply if required. */
+ float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
+ return vbslq_f32 (pred_alt, inv_y, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (tan)
+
+TEST_SIG (V, F, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
new file mode 100644
index 000000000000..3dc6e5527ffc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
@@ -0,0 +1,67 @@
+/*
+ * Double-precision vector tanh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ .d = V_EXPM1_DATA,
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
+{
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+ /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+ They will be fixed up later by the special-case handler. */
+ if (unlikely (v_any_u64 (special)))
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
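+ /* Equivalently, with q = expm1(2x): tanh(x) = q / (q + 2). */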
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, q, qp2, special);
+ return vdivq_f64 (q, qp2);
+}
+
+TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (tanh), 2.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
new file mode 100644
index 000000000000..18fe93c7e7ba
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
@@ -0,0 +1,81 @@
+/*
+ * Single-precision vector tanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t boring_bound, large_bound, onef;
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = V4 (0x41102cb3),
+ .large_bound = V4 (0x7f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
+{
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.58 ULP:
+ _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+ /* exponent_bias in the expm1f data is asuint (1.0f), so boring is +/-1.0f. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered properly, set all special and boring
+ lanes to 0, which will trigger no exceptions, and fix them up later. */
+ uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+ vcltq_u32 (iax, v_u32 (0x34000000)));
+ x = v_zerofy_f32 (x, is_boring);
+ if (unlikely (v_any_u32 (special)))
+ x = v_zerofy_f32 (x, special);
+#else
+ uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+ return vbslq_f32 (is_boring, boring, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (tanh)
+
+TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (tanh), 2.09)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
new file mode 100644
index 000000000000..16de00ad5556
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
@@ -0,0 +1,88 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12;
+ double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+ /* Coefficients for tan(pi * x) computed with fpminimax
+ on [ 0x1p-1022 0x1p-2 ]
+ approx rel error: 0x1.7eap-55
+ approx abs error: 0x1.7eap-55. */
+ .c0 = V2 (0x1.921fb54442d18p1), /* pi. */
+ .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5),
+ .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9),
+ .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13),
+ .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17),
+ .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21),
+ .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
+ .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+ The maximum error is 3.06 ULP:
+ _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+ want -0x1.fa30112702c95p+3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
+{
+ const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+ float64x2_t n = vrndnq_f64 (x);
+
+ /* inf produces nan that propagates. */
+ float64x2_t xr = vsubq_f64 (x, n);
+ float64x2_t ar = vabdq_f64 (x, n);
+ uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
+ float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
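+ /* tan(pi * x) has period 1, so tan(pi * x) = tan(pi * xr) with
+ xr = x - rint(x) in [-0.5, 0.5]. For |xr| > 0.25, use
+ tan(pi * |xr|) = 1 / tan(pi * (0.5 - |xr|)), selected via 'flip'; the sign
+ of xr is restored at the end. */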
+
+ /* Order-14 pairwise Horner. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+
+ float64x2_t c_1_3 = vld1q_f64 (&d->c1);
+ float64x2_t c_5_7 = vld1q_f64 (&d->c5);
+ float64x2_t c_9_11 = vld1q_f64 (&d->c9);
+ float64x2_t c_13_14 = vld1q_f64 (&d->c13);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
+
+ float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
+ p = vfmaq_f64 (p1011, r4, p);
+ p = vfmaq_f64 (p89, r4, p);
+ p = vfmaq_f64 (p67, r4, p);
+ p = vfmaq_f64 (p45, r4, p);
+ p = vfmaq_f64 (p23, r4, p);
+ p = vfmaq_f64 (p01, r4, p);
+ p = vmulq_f64 (r, p);
+
+ float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
+ float64x2_t y = vbslq_f64 (flip, p_recip, p);
+
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
+ return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
+TEST_ULP (V_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
new file mode 100644
index 000000000000..7bd6d206819f
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
@@ -0,0 +1,70 @@
+/*
+ * Single-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpif_data
+{
+ float32x4_t c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+} tanpif_data = {
+ /* Coefficients for tan(pi * x). */
+ .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f),
+ .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
+ .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpi(x)
+ The maximum error is 3.34 ULP:
+ _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+ want 0x1.f70aa6p+2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
+{
+ const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+
+ float32x4_t n = vrndnq_f32 (x);
+
+ /* inf produces nan that propagates. */
+ float32x4_t xr = vsubq_f32 (x, n);
+ float32x4_t ar = vabdq_f32 (x, n);
+ uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
+ float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
+
+ /* Order-7 pairwise Horner polynomial evaluation scheme. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+
+ float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
+ float32x4_t p = vfmaq_f32 (p45, r4, p67);
+ p = vfmaq_f32 (p23, r4, p);
+ p = vfmaq_f32 (p01, r4, p);
+
+ p = vmulq_f32 (r, p);
+ float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
+ float32x4_t y = vbslq_f32 (flip, p_recip, p);
+
+ uint32x4_t sign
+ = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
+ return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
+}
+
+HALF_WIDTH_ALIAS_F1 (tanpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
+TEST_ULP (V_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
new file mode 100644
index 000000000000..797d217820c3
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
@@ -0,0 +1,58 @@
+/*
+ * Helper for single-precision routines which calculate exp(ax) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPF_INLINE_H
+#define MATH_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
+ }
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(ax).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
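+ /* Adding n << 23 to asuint (1.0f) places n directly in the exponent field,
+ giving scale = 2^n; there is no overflow/underflow handling here - the
+ calling routine deals with special values. */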
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif // MATH_V_EXPF_INLINE_H
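As an editorial sketch (not part of the patch), this is roughly how a caller such as the coshf routine in this directory can embed the helper. The function and constant names below are hypothetical, and special-case handling (overflow, NaN) is assumed to happen in the caller.

#include "v_expf_inline.h"

static const struct v_expf_data expf_consts = V_EXPF_DATA;

static inline float32x4_t
v_cosh_core_example (float32x4_t x)
{
  /* The helper takes |x| internally, so t = exp(|x|).  */
  float32x4_t t = v_expf_inline (x, &expf_consts);
  /* cosh(x) = (exp(|x|) + 1/exp(|x|)) / 2, valid away from overflow.  */
  float32x4_t half_t = vmulq_f32 (t, v_f32 (0.5f));
  float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5f), t);
  return vaddq_f32 (half_t, half_over_t);
}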
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
new file mode 100644
index 000000000000..82d2e9415d93
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
@@ -0,0 +1,86 @@
+/*
+ * Helper for double-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1_INLINE_H
+#define MATH_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */

+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif // MATH_V_EXPM1_INLINE_H
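As an editorial sketch (not part of the patch), a sinh-style caller of the helper above. With t = expm1(|x|), exp(|x|) - exp(-|x|) = t + t / (t + 1), so sinh(x) = copysign (0.5 * (t + t / (t + 1)), x). Names are hypothetical and special cases (large |x|, NaN) are assumed to be filtered by the caller.

#include "v_expm1_inline.h"

static const struct v_expm1_data expm1_consts = V_EXPM1_DATA;

static inline float64x2_t
v_sinh_core_example (float64x2_t x)
{
  uint64x2_t sign
      = vandq_u64 (vreinterpretq_u64_f64 (x), v_u64 (0x8000000000000000));
  float64x2_t t = expm1_inline (vabsq_f64 (x), &expm1_consts);
  float64x2_t body
      = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
  /* Fold the factor 0.5 and the sign of x into one multiplicand.  */
  float64x2_t halfsign = vreinterpretq_f64_u64 (
      vorrq_u64 (vreinterpretq_u64_f64 (v_f64 (0.5)), sign));
  return vmulq_f64 (body, halfsign);
}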
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
new file mode 100644
index 000000000000..463b07aa7705
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
@@ -0,0 +1,62 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1F_INLINE_H
+#define MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1f_data
+{
+ float32x4_t c0, c2;
+ int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f). */
+#define V_EXPM1F_DATA \
+ { \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
+ p = vfmaq_f32 (f, f2, p);
+
+ /* t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif // MATH_V_EXPM1F_INLINE_H
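As an editorial sketch (not part of the patch), a tanhf-style caller of the helper above. With q = expm1(2x), tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2). Names are hypothetical; large |x| and NaN are assumed to be handled by the caller.

#include "v_expm1f_inline.h"

static const struct v_expm1f_data expm1f_consts = V_EXPM1F_DATA;

static inline float32x4_t
v_tanh_core_example (float32x4_t x)
{
  float32x4_t q = expm1f_inline (vmulq_f32 (x, v_f32 (2.0f)), &expm1f_consts);
  return vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0f)));
}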
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
new file mode 100644
index 000000000000..ef906ae4b603
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
@@ -0,0 +1,119 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and
+ * do not need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef MATH_V_LOG1P_INLINE_H
+#define MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+
+struct v_log1p_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2[2];
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
+ .c18 = -0x1.cfa7385bdb37ep-6, \
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+ p = vfmaq_f64 (p1415, m2, p);
+ p = vfmaq_f64 (p1213, m2, p);
+ p = vfmaq_f64 (p1011, m2, p);
+ p = vfmaq_f64 (p89, m2, p);
+ p = vfmaq_f64 (p67, m2, p);
+ p = vfmaq_f64 (p45, m2, p);
+ p = vfmaq_f64 (p23, m2, p);
+ return vfmaq_f64 (p01, m2, p);
+}
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1):
+ - No special-case handling - this should be dealt with by the caller.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+       using a vector select, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+# error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = eval_poly (f, f2, d);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif // MATH_V_LOG1P_INLINE_H
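The #error above means the k0-shortcut knob must be chosen before the header is included. A minimal, hypothetical caller (editorial sketch, not part of the patch) looks like this:

#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"

static const struct v_log1p_data log1p_consts = V_LOG1P_CONSTANTS_TABLE;

static inline float64x2_t
v_log1p_wrapper_example (float64x2_t x)
{
  /* No special-case handling here: the caller is expected to filter
     x <= -1, NaN and other problem inputs first.  */
  return log1p_inline (x, &log1p_consts);
}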
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
new file mode 100644
index 000000000000..e81fa24486ae
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
@@ -0,0 +1,94 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_LOG1PF_INLINE_H
+#define MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+struct v_log1pf_data
+{
+ uint32x4_t four;
+ int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
+ p = vmulq_f32 (m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
+{
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+   scale factor s = 4*2^(-k) to ensure the scale is representable
+ as a normalised fp32 number. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+  /* The scale factor to be applied back at the end - multiplying float(k)
+     by 2^-23 recovers k, the unbiased exponent, as a float. */
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
+}
+
+#endif // MATH_V_LOG1PF_INLINE_H
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
new file mode 100644
index 000000000000..770f9e81c195
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
@@ -0,0 +1,104 @@
+/*
+ * Double-precision vector log(x) function - inline version
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "math_config.h"
+
+#ifndef V_LOG_INLINE_POLY_ORDER
+# error Cannot use inline log helper without specifying poly order (options are 4 or 5)
+#endif
+
+#if V_LOG_INLINE_POLY_ORDER == 4
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \
+ V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \
+ }
+#elif V_LOG_INLINE_POLY_ORDER == 5
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \
+ V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \
+ V2 (-0x1.554e550bd501ep-3) \
+ }
+#else
+# error Can only choose order 4 or 5 for log poly
+#endif
+
+struct v_log_inline_data
+{
+ float64x2_t poly[V_LOG_INLINE_POLY_ORDER];
+ float64x2_t ln2;
+ uint64x2_t off, sign_exp_mask;
+};
+
+#define V_LOG_CONSTANTS \
+ { \
+ .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \
+ .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \
+ }
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+log_lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static inline float64x2_t
+v_log_inline (float64x2_t x, const struct v_log_inline_data *d)
+{
+ float64x2_t z, r, r2, p, y, kd, hi;
+ uint64x2_t ix, iz, tmp;
+ int64x2_t k;
+ struct entry e;
+
+ ix = vreinterpretq_u64_f64 (x);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = vsubq_u64 (ix, d->off);
+ k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ z = vreinterpretq_f64_u64 (iz);
+ e = log_lookup (tmp);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (A (2), A (3), r);
+ p = vfmaq_f64 (A (0), A (1), r);
+#if V_LOG_INLINE_POLY_ORDER == 5
+ y = vfmaq_f64 (y, A (4), r2);
+#endif
+ y = vfmaq_f64 (p, y, r2);
+
+ return vfmaq_f64 (hi, y, r2);
+}
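Likewise, v_log_inline.h refuses to compile unless the polynomial order is chosen before inclusion. A hypothetical caller (editorial sketch, not part of the patch):

#define V_LOG_INLINE_POLY_ORDER 5
#include "v_log_inline.h"

static const struct v_log_inline_data log_consts = V_LOG_CONSTANTS;

static inline float64x2_t
v_log_wrapper_example (float64x2_t x)
{
  /* Valid for positive, finite, normal x; zero, negative, subnormal and
     non-finite inputs are the caller's responsibility.  */
  return v_log_inline (x, &log_consts);
}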
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
new file mode 100644
index 000000000000..75cd71cc87a7
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
@@ -0,0 +1,202 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
+#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
+
+#if USE_GLIBC_ABI
+
+# define HALF_WIDTH_ALIAS_F1(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
+ { \
+ return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
+ }
+
+# define HALF_WIDTH_ALIAS_F2(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
+ { \
+ return vget_low_f32 ( \
+ _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
+ }
+
+#else
+# define HALF_WIDTH_ALIAS_F1(fun)
+# define HALF_WIDTH_ALIAS_F2(fun)
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+#define V2(X) \
+ { \
+ X, X \
+ }
+#define V4(X) \
+ { \
+ X, X, X, X \
+ }
+#define V8(X) \
+ { \
+ X, X, X, X, X, X, X, X \
+ }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+ return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+ return (uint32x4_t) V4 (x);
+}
+static inline int32x4_t
+v_s32 (int32_t x)
+{
+ return (int32x4_t) V4 (x);
+}
+
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+ return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+ return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+ float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2],
+ p[3] ? f (x1[3], x2[3]) : y[3] };
+}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
+static inline float64x2_t
+v_f64 (double x)
+{
+ return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+ return (uint64x2_t) V2 (x);
+}
+static inline int64x2_t
+v_s64 (int64_t x)
+{
+ return (int64x2_t) V2 (x);
+}
+
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+ return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+ return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+ double p1 = p[1];
+ double x1 = x[1];
+ if (likely (p[0]))
+ y[0] = f (x[0]);
+ if (likely (p1))
+ y[1] = f (x1);
+ return y;
+}
+
+static inline float64x2_t
+v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
+ float64x2_t y, uint64x2_t p)
+{
+ double p1 = p[1];
+ double x1h = x1[1];
+ double x2h = x2[1];
+ if (likely (p[0]))
+ y[0] = f (x1[0], x2[0]);
+ if (likely (p1))
+ y[1] = f (x1h, x2h);
+ return y;
+}
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
+{
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
+}
+
+#endif
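An editorial sketch (not part of the patch) of how the v_call_f32 fallback helper above is typically used: a routine computes its fast vector result, builds a lane mask of special inputs, and re-evaluates only the flagged lanes with a scalar function. The names below are hypothetical; expf is just a stand-in for whichever scalar routine matches the vector one.

#include <math.h>
#include "v_math.h"

static float32x4_t VPCS_ATTR
special_case_example (float32x4_t x, float32x4_t y, uint32x4_t special)
{
  /* Lanes flagged in `special` are recomputed with the scalar expf; all
     other lanes keep the fast vector result y.  */
  return v_call_f32 (expf, x, y, special);
}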
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
new file mode 100644
index 000000000000..9a9c5c1ac15b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on single-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_POLY_ADVSIMD_F32_H
+#define MATH_POLY_ADVSIMD_F32_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f32. */
+#define VTYPE float32x4_t
+#define FMA(x, y, z) vfmaq_f32 (z, x, y)
+#define VWRAP(f) v_##f##_f32
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
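An editorial sketch (not part of the patch) of how the generated wrappers are called, assuming the poly_generic helpers behave as in the sincospi callers later in this patch (v_pw_horner_5_f32 and friends). The coefficients here are arbitrary and purely illustrative.

#include "v_math.h"
#include "v_poly_f32.h"

/* Arbitrary degree-5 polynomial coefficients, for illustration only.  */
static const float32x4_t example_poly[6]
    = { V4 (1.0f),   V4 (0.5f),    V4 (0.25f),
	V4 (0.125f), V4 (0.0625f), V4 (0.03125f) };

static inline float32x4_t
poly_eval_example (float32x4_t x)
{
  /* v_pw_horner_5_f32 (x, x^2, c) evaluates c[0] + c[1] x + ... + c[5] x^5
     with the pairwise-Horner scheme.  */
  float32x4_t x2 = vmulq_f32 (x, x);
  return v_pw_horner_5_f32 (x, x2, example_poly);
}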
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
new file mode 100644
index 000000000000..4331bfbd03b0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on double-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_POLY_ADVSIMD_F64_H
+#define MATH_POLY_ADVSIMD_F64_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f64. */
+#define VTYPE float64x2_t
+#define FMA(x, y, z) vfmaq_f64 (z, x, y)
+#define VWRAP(f) v_##f##_f64
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
new file mode 100644
index 000000000000..14227d9339a8
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
@@ -0,0 +1,86 @@
+/*
+ * Core approximation for double-precision vector sincos
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincos_data
+{
+ float64x2_t sin_poly[7], cos_poly[6], pio2[3];
+ float64x2_t inv_pio2, shift, range_val;
+} v_sincos_data = {
+ .inv_pio2 = V2 (0x1.45f306dc9c882p-1),
+ .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26),
+ V2 (0x1.1a62633145c07p-54) },
+ .shift = V2 (0x1.8p52),
+ .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */
+ V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */
+ V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
+ V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
+ V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
+ .range_val = V2 (0x1p23), };
+
+static inline uint64x2_t
+check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
+{
+ return vcagtq_f64 (x, d->range_val);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+static inline float64x2x2_t
+v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
+{
+ /* q = nearest integer to 2 * x / pi. */
+ float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
+ int64x2_t n = vcvtq_s64_f64 (q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, q, d->pio2[0]);
+ r = vfmsq_f64 (r, q, d->pio2[1]);
+ r = vfmsq_f64 (r, q, d->pio2[2]);
+
+ float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
+ s = vfmaq_f64 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
+ c = vfmaq_f64 (v_f64 (-0.5), r2, c);
+ c = vfmaq_f64 (v_f64 (1), r2, c);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
+ float64x2_t ss = vbslq_f64 (swap, c, s);
+ float64x2_t cc = vbslq_f64 (swap, s, c);
+
+ /* Fix signs according to quadrant.
+ ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+ cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)). */
+ uint64x2_t sin_sign
+ = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
+ uint64x2_t cos_sign = vshlq_n_u64 (
+ vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
+ 62);
+ ss = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
+ cc = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
+
+ return (float64x2x2_t){ ss, cc };
+}
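An editorial sketch (not part of the patch) of a front-end around the helper above, in the spirit of the sincos/cexpi callers. The real routines dispatch flagged lanes to a scalar fallback and fetch the data pointer through ptr_barrier; both are simplified away here, and the name is hypothetical.

#include "v_sincos_common.h"

static inline void
sincos_example (float64x2_t x, float64x2_t *out_sin, float64x2_t *out_cos)
{
  const struct v_sincos_data *d = &v_sincos_data;
  /* Lanes with |x| above range_val would need a fallback path, which is
     omitted from this sketch.  */
  uint64x2_t special = check_ge_rangeval (x, d);
  (void) special;
  float64x2x2_t sc = v_sincos_inline (x, d);
  *out_sin = sc.val[0];
  *out_cos = sc.val[1];
}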
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
new file mode 100644
index 000000000000..7c29eded14d6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
@@ -0,0 +1,84 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+static const struct v_sincosf_data
+{
+ float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} v_sincosf_data = {
+ .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */
+ V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
+ .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */
+ V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
+ .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
+ .inv_pio2 = V4 (0x1.45f306p-1f),
+ .shift = V4 (0x1.8p23),
+ .range_val = V4 (0x1p20),
+};
+
+static inline uint32x4_t
+check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
+{
+ return vcagtq_f32 (x, d->range_val);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+static inline float32x4x2_t
+v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
+{
+ /* n = rint ( x / (pi/2) ). */
+ float32x4_t shift = d->shift;
+ float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
+ q = vsubq_f32 (q, shift);
+ int32x4_t n = vcvtq_s32_f32 (q);
+
+ /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
+ float32x4_t r = x;
+ r = vfmsq_f32 (r, q, d->pio2[0]);
+ r = vfmsq_f32 (r, q, d->pio2[1]);
+ r = vfmsq_f32 (r, q, d->pio2[2]);
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
+ float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
+ s = vfmaq_f32 (d->poly_sin[0], r2, s);
+ s = vfmaq_f32 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
+ float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
+ c = vfmaq_f32 (c, r4, p);
+ c = vfmaq_f32 (v_f32 (1), c, r2);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
+ float32x4_t ss = vbslq_f32 (swap, c, s);
+ float32x4_t cc = vbslq_f32 (swap, s, c);
+
+ /* Fix signs according to quadrant.
+ ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+ cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */
+ uint32x4_t sin_sign
+ = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
+ uint32x4_t cos_sign = vshlq_n_u32 (
+ vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
+ 30);
+ ss = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
+ cc = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
+
+ return (float32x4x2_t){ ss, cc };
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
new file mode 100644
index 000000000000..438b141b9174
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
@@ -0,0 +1,64 @@
+/*
+ * Helper for Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincospi_data
+{
+ float64x2_t poly[10], range_val;
+} v_sincospi_data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using separate argument reduction and shared low-order
+ polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+static inline float64x2x2_t
+v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
+{
+  /* If x rounds to an odd integer, the sign of the result should be
+     inverted for sinpi and reintroduced for cospi. */
+ uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
+ uint64x2_t odd = vshlq_n_u64 (
+ vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
+
+ /* r = x - rint(x). */
+ float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float64x2_t sr2 = vmulq_f64 (sr, sr);
+ float64x2_t sr4 = vmulq_f64 (sr2, sr2);
+ float64x2_t cr2 = vmulq_f64 (cr, cr);
+ float64x2_t cr4 = vmulq_f64 (cr2, cr2);
+
+ float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
+ float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
+
+ float64x2_t sinpix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
+
+ float64x2_t cospix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
+
+ return (float64x2x2_t){ sinpix, cospix };
+}
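An editorial scalar model (not part of the patch) of the reduction used above: with n = round(x) and r = x - n, sinpi(x) = (-1)^n * sinpi(r) and cospi(x) = (-1)^n * sinpi(0.5 - |r|), so a single polynomial for sinpi on [-0.5, 0.5] serves both outputs. The huge-argument masking (range_val) is omitted, so the cast below assumes |x| < 2^63; libm sin stands in for the polynomial.

#include <math.h>

static void
sincospi_scalar_model (double x, double *sp, double *cp)
{
  const double pi = 0x1.921fb54442d18p+1;
  double n = nearbyint (x);	    /* The vector code rounds ties away.  */
  double r = x - n;		    /* r in [-0.5, 0.5].  */
  int negate = (long long) n & 1;   /* Odd n flips both signs.  */
  double s = sin (pi * r);	    /* Stands in for the shared polynomial.  */
  double c = sin (pi * (0.5 - fabs (r)));
  *sp = negate ? -s : s;
  *cp = negate ? -c : c;
}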
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
new file mode 100644
index 000000000000..8d4177dd871e
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
@@ -0,0 +1,57 @@
+/*
+ * Helper for Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+static const struct v_sincospif_data
+{
+ float32x4_t poly[6], range_val;
+} v_sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+static inline float32x4x2_t
+v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
+{
+  /* If x rounds to an odd integer, the sign of the result should be
+     inverted for sinpi and reintroduced for cospi. */
+ uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
+ uint32x4_t odd = vshlq_n_u32 (
+ vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
+
+ /* r = x - rint(x). */
+ float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t sr2 = vmulq_f32 (sr, sr);
+ float32x4_t sr4 = vmulq_f32 (sr2, sr2);
+ float32x4_t cr2 = vmulq_f32 (cr, cr);
+ float32x4_t cr4 = vmulq_f32 (cr2, cr2);
+
+ float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
+ float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
+
+ float32x4_t sinpix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
+ float32x4_t cospix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
+
+ return (float32x4x2_t){ sinpix, cospix };
+}